;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
;*          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Li Cao <li@multicorewareinc.com>
;*          Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

; TODO: Further optimize the routines.

%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
tab_dct8:       dw 64, 64, 64, 64, 64, 64, 64, 64
                dw 89, 75, 50, 18, -18, -50, -75, -89
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw 75, -18, -89, -50, 50, 89, 18, -75
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw 50, -89, 18, 75, -75, -18, 89, -50
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw 18, -50, 75, -89, 89, -75, 50, -18
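
; The rows above are the eight basis vectors of the HEVC 8-point forward
; DCT matrix: row k holds T[k][0..7], even rows being symmetric and odd
; rows antisymmetric, which is the property the butterfly code below
; exploits. tab_dct16_1/tab_dct16_2 and tab_dct32_1/tab_dct32_2 hold the
; left and right halves of the 16- and 32-point matrices the same way.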

dct8_shuf:      times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9

tab_dct16_1:    dw 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 87, 80, 70, 57, 43, 25,  9
                dw 89, 75, 50, 18, -18, -50, -75, -89
                dw 87, 57,  9, -43, -80, -90, -70, -25
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw 80,  9, -70, -87, -25, 57, 90, 43
                dw 75, -18, -89, -50, 50, 89, 18, -75
                dw 70, -43, -87,  9, 90, 25, -80, -57
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 50, -89, 18, 75, -75, -18, 89, -50
                dw 43, -90, 57, 25, -87, 70,  9, -80
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw 25, -70, 90, -80, 43,  9, -57, 87
                dw 18, -50, 75, -89, 89, -75, 50, -18
                dw  9, -25, 43, -57, 70, -80, 87, -90


tab_dct16_2:    dw 64, 64, 64, 64, 64, 64, 64, 64
                dw -9, -25, -43, -57, -70, -80, -87, -90
                dw -89, -75, -50, -18, 18, 50, 75, 89
                dw 25, 70, 90, 80, 43, -9, -57, -87
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw -43, -90, -57, 25, 87, 70, -9, -80
                dw -75, 18, 89, 50, -50, -89, -18, 75
                dw 57, 80, -25, -90, -9, 87, 43, -70
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw -70, -43, 87,  9, -90, 25, 80, -57
                dw -50, 89, -18, -75, 75, 18, -89, 50
                dw 80, -9, -70, 87, -25, -57, 90, -43
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw -87, 57, -9, -43, 80, -90, 70, -25
                dw -18, 50, -75, 89, -89, 75, -50, 18
                dw 90, -87, 80, -70, 57, -43, 25, -9

dct16_shuf1:    times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1

dct16_shuf2:    times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9

tab_dct32_1:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90
                dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90

tab_dct32_2:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
                dw -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
                dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
                dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
                dw 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
                dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
                dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
                dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
                dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
                dw -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
                dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
                dw -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
                dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
                dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
                dw 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
                dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
                dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4

avx2_idct8_1:   times 4 dw 64, 83, 64, 36
                times 4 dw 64, 36, -64, -83
                times 4 dw 64, -36, -64, 83
                times 4 dw 64, -83, 64, -36

avx2_idct8_2:   times 4 dw 89, 75, 50, 18
                times 4 dw 75, -18, -89, -50
                times 4 dw 50, -89, 18, 75
                times 4 dw 18, -50, 75, -89

idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7

const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15

idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3

tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
                dw 87, 57, 9, -43, -80, -90, -70, -25
                dw 80, 9, -70, -87, -25, 57, 90, 43
                dw 70, -43, -87, 9, 90, 25, -80, -57
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 43, -90, 57, 25, -87, 70, 9, -80
                dw 25, -70, 90, -80, 43, 9, -57, 87
                dw 9, -25, 43, -57, 70, -80, 87, -90

tab_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
                dw 64, 75, 36, -18, -64, -89, -83, -50
                dw 64, 50, -36, -89, -64, 18, 83, 75
                dw 64, 18, -83, -50, 64, 75, -36, -89
                dw 64, -18, -83, 50, 64, -75, -36, 89
                dw 64, -50, -36, 89, -64, -18, 83, -75
                dw 64, -75, 36, 18, -64, 89, -83, 50
                dw 64, -89, 83, -75, 64, -50, 36, -18

idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7

idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5

tab_idct32_1:   dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90


tab_idct32_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
                dw 64, 75, 36, -18, -64, -89, -83, -50
                dw 64, 50, -36, -89, -64, 18, 83, 75
                dw 64, 18, -83, -50, 64, 75, -36, -89
                dw 64, -18, -83, 50, 64, -75, -36, 89
                dw 64, -50, -36, 89, -64, -18, 83, -75
                dw 64, -75, 36, 18, -64, 89, -83, 50
                dw 64, -89, 83, -75, 64, -50, 36, -18


tab_idct32_3:   dw 90, 87, 80, 70, 57, 43, 25, 9
                dw 87, 57, 9, -43, -80, -90, -70, -25
                dw 80, 9, -70, -87, -25, 57, 90, 43
                dw 70, -43, -87, 9, 90, 25, -80, -57
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 43, -90, 57, 25, -87, 70, 9, -80
                dw 25, -70, 90, -80, 43, 9, -57, 87
                dw 9, -25, 43, -57, 70, -80, 87, -90

tab_idct32_4:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
                dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
                dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
                dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
                dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
                dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
                dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
                dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
                dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
                dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
                dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
                dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
                dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
                dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
                dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
                dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9

avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83

avx2_idct4_1:   dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83

avx2_idct4_2:   dw 64, 64, 64, -64, 83, 36, 36, -83

const idct4_shuf1,    times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15

idct4_shuf2:    times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11

tab_dct4:       times 4 dw 64, 64
                times 4 dw 83, 36
                times 4 dw 64, -64
                times 4 dw 36, -83

dct4_shuf:      db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13

tab_dst4:       times 2 dw 29, 55, 74, 84
                times 2 dw 74, 74,  0, -74
                times 2 dw 84, -29, -74, 55
                times 2 dw 55, -84, 74, -29

pw_dst4_tab:    times 4 dw 29,  55,  74,  84
                times 4 dw 74,  74,   0, -74
                times 4 dw 84, -29, -74,  55
                times 4 dw 55, -84,  74, -29

tab_idst4:      times 4 dw 29, +84
                times 4 dw +74, +55
                times 4 dw 55, -29
                times 4 dw +74, -84
                times 4 dw 74, -74
                times 4 dw 0, +74
                times 4 dw 84, +55
                times 4 dw -74, -29

pw_idst4_tab:   times 4 dw  29,  84
                times 4 dw  55, -29
                times 4 dw  74,  55
                times 4 dw  74, -84
                times 4 dw  74, -74
                times 4 dw  84,  55
                times 4 dw   0,  74
                times 4 dw -74, -29
pb_idst4_shuf:  times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15

tab_dct8_1:     times 2 dw 89, 50, 75, 18
                times 2 dw 75, -89, -18, -50
                times 2 dw 50, 18, -89, 75
                times 2 dw 18, 75, -50, -89

tab_dct8_2:     times 2 dd 83, 36
                times 2 dd 36, 83
                times 1 dd 89, 75, 50, 18
                times 1 dd 75, -18, -89, -50
                times 1 dd 50, -89, 18, 75
                times 1 dd 18, -50, 75, -89

tab_idct8_3:    times 4 dw 89, 75
                times 4 dw 50, 18
                times 4 dw 75, -18
                times 4 dw -89, -50
                times 4 dw 50, -89
                times 4 dw 18, 75
                times 4 dw 18, -50
                times 4 dw 75, -89

pb_unpackhlw1:  db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15

pb_idct8even:   db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13

tab_idct8_1:    times 1 dw 64, -64, 36, -83, 64, 64, 83, 36

tab_idct8_2:    times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
                times 1 dw 50, -89, 18, 75, 18, -50, 75, -89

pb_idct8odd:    db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15

SECTION .text
cextern pd_1
cextern pd_2
cextern pd_4
cextern pd_8
cextern pd_16
cextern pd_32
cextern pd_64
cextern pd_128
cextern pd_256
cextern pd_512
cextern pd_1024
cextern pd_2048
cextern pw_ppppmmmm
cextern trans8_shuf


%if BIT_DEPTH == 12
    %define     DCT4_SHIFT          5
    %define     DCT4_ROUND          16
    %define    IDCT_SHIFT           8
    %define    IDCT_ROUND           128
    %define     DST4_SHIFT          5
    %define     DST4_ROUND          16
    %define     DCT8_SHIFT1         6
    %define     DCT8_ROUND1         32
%elif BIT_DEPTH == 10
    %define     DCT4_SHIFT          3
    %define     DCT4_ROUND          4
    %define    IDCT_SHIFT           10
    %define    IDCT_ROUND           512
    %define     DST4_SHIFT          3
    %define     DST4_ROUND          4
    %define     DCT8_SHIFT1         4
    %define     DCT8_ROUND1         8
%elif BIT_DEPTH == 8
    %define     DCT4_SHIFT          1
    %define     DCT4_ROUND          1
    %define    IDCT_SHIFT           12
    %define    IDCT_ROUND           2048
    %define     DST4_SHIFT          1
    %define     DST4_ROUND          1
    %define     DCT8_SHIFT1         2
    %define     DCT8_ROUND1         2
%else
    %error Unsupported BIT_DEPTH!
%endif

%define         DCT8_ROUND2         256
%define         DCT8_SHIFT2         9
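
; These follow the usual HEVC two-pass convention (a sketch of the
; derivation only; the code just uses the constants above):
;   forward pass 1: shift = log2(N) + BIT_DEPTH - 9, round = 1 << (shift - 1)
;   forward pass 2: shift = log2(N) + 6 (8 for N == 4, 9 for N == 8)
;   inverse pass 1: shift = 7, round = 64
;   inverse pass 2: shift = 20 - BIT_DEPTH (IDCT_SHIFT/IDCT_ROUND above)
; e.g. BIT_DEPTH == 10, N == 4 gives DCT4_SHIFT = 2 + 10 - 9 = 3, DCT4_ROUND = 4.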

;------------------------------------------------------
;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
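; In outline, each pass computes per 4-sample vector (s0..s3), before the
; rounding shift (DCT4_SHIFT in pass 1, 8 in pass 2):
;   E0 = s0 + s3    O0 = s0 - s3
;   E1 = s1 + s2    O1 = s1 - s2
;   d0 = 64*E0 + 64*E1    d1 = 83*O0 + 36*O1
;   d2 = 64*E0 - 64*E1    d3 = 36*O0 - 83*O1
; The shuffles below only arrange the data so pmaddwd can form these sums.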
INIT_XMM sse2
cglobal dct4, 3, 4, 8
    mova        m7, [pd_ %+ DCT4_ROUND]
    add         r2d, r2d
    lea         r3, [tab_dct4]

    mova        m4, [r3 + 0 * 16]
    mova        m5, [r3 + 1 * 16]
    mova        m6, [r3 + 2 * 16]
    movh        m0, [r0 + 0 * r2]
    movh        m1, [r0 + 1 * r2]
    punpcklqdq  m0, m1
    pshufd      m0, m0, 0xD8
    pshufhw     m0, m0, 0xB1

    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movh        m2, [r0 + r2]
    punpcklqdq  m1, m2
    pshufd      m1, m1, 0xD8
    pshufhw     m1, m1, 0xB1

    punpcklqdq  m2, m0, m1
    punpckhqdq  m0, m1

    paddw       m1, m2, m0
    psubw       m2, m0
    pmaddwd     m0, m1, m4
    paddd       m0, m7
    psrad       m0, DCT4_SHIFT
    pmaddwd     m3, m2, m5
    paddd       m3, m7
    psrad       m3, DCT4_SHIFT
    packssdw    m0, m3
    pshufd      m0, m0, 0xD8
    pshufhw     m0, m0, 0xB1
    pmaddwd     m1, m6
    paddd       m1, m7
    psrad       m1, DCT4_SHIFT
    pmaddwd     m2, [r3 + 3 * 16]
    paddd       m2, m7
    psrad       m2, DCT4_SHIFT
    packssdw    m1, m2
    pshufd      m1, m1, 0xD8
    pshufhw     m1, m1, 0xB1

    punpcklqdq  m2, m0, m1
    punpckhqdq  m0, m1

    mova        m7, [pd_128]

    pmaddwd     m1, m2, m4
    pmaddwd     m3, m0, m4
    paddd       m1, m3
    paddd       m1, m7
    psrad       m1, 8

    pmaddwd     m4, m2, m5
    pmaddwd     m3, m0, m5
    psubd       m4, m3
    paddd       m4, m7
    psrad       m4, 8
    packssdw    m1, m4
    movu        [r1 + 0 * 16], m1

    pmaddwd     m1, m2, m6
    pmaddwd     m3, m0, m6
    paddd       m1, m3
    paddd       m1, m7
    psrad       m1, 8

    pmaddwd     m2, [r3 + 3 * 16]
    pmaddwd     m0, [r3 + 3 * 16]
    psubd       m2, m0
    paddd       m2, m7
    psrad       m2, 8
    packssdw    m1, m2
    movu        [r1 + 1 * 16], m1
    RET

; DCT 4x4
;
; Input parameters:
; - r0:     source
; - r1:     destination
; - r2:     source stride
INIT_YMM avx2
cglobal dct4, 3, 4, 8, src, dst, srcStride
    vbroadcasti128  m7, [pd_ %+ DCT4_ROUND]
    add             r2d, r2d
    lea             r3, [avx2_dct4]

    vbroadcasti128  m4, [dct4_shuf]
    mova            m5, [r3]
    mova            m6, [r3 + 32]
    movq            xm0, [r0]
    movhps          xm0, [r0 + r2]
    lea             r0, [r0 + 2 * r2]
    movq            xm1, [r0]
    movhps          xm1, [r0 + r2]

    vinserti128     m0, m0, xm1, 1
    pshufb          m0, m4
    vpermq          m1, m0, 11011101b
    vpermq          m0, m0, 10001000b
    paddw           m2, m0, m1
    psubw           m0, m1

    pmaddwd         m2, m5
    paddd           m2, m7
    psrad           m2, DCT4_SHIFT

    pmaddwd         m0, m6
    paddd           m0, m7
    psrad           m0, DCT4_SHIFT

    packssdw        m2, m0
    pshufb          m2, m4
    vpermq          m1, m2, 11011101b
    vpermq          m2, m2, 10001000b
    vbroadcasti128  m7, [pd_128]

    pmaddwd         m0, m2, m5
    pmaddwd         m3, m1, m5
    paddd           m3, m0
    paddd           m3, m7
    psrad           m3, 8

    pmaddwd         m2, m6
    pmaddwd         m1, m6
    psubd           m2, m1
    paddd           m2, m7
    psrad           m2, 8

    packssdw        m3, m2
    movu            [r1], m3
    RET

;-------------------------------------------------------
;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
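; In outline, per 4-coefficient vector (s0..s3), in both passes (shift 7
; with pd_64, then IDCT_SHIFT with IDCT_ROUND):
;   E0 = 64*s0 + 64*s2    O0 = 83*s1 + 36*s3
;   E1 = 64*s0 - 64*s2    O1 = 36*s1 - 83*s3
;   d0 = E0 + O0    d1 = E1 + O1    d2 = E1 - O1    d3 = E0 - O0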
INIT_XMM sse2
cglobal idct4, 3, 4, 6
    add         r2d, r2d
    lea         r3, [tab_dct4]

    movu        m0, [r0 + 0 * 16]
    movu        m1, [r0 + 1 * 16]

    punpcklwd   m2, m0, m1
    pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
    paddd       m3, [pd_64]

    pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
    paddd       m2, [pd_64]

    punpckhwd   m0, m1
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, 7                       ; m4 = m128iA
    paddd       m5, m2, m0
    psrad       m5, 7
    packssdw    m4, m5                      ; m4 = m128iA

    psubd       m2, m0
    psrad       m2, 7
    psubd       m3, m1
    psrad       m3, 7
    packssdw    m2, m3                      ; m2 = m128iD

    punpcklwd   m1, m4, m2                  ; m1 = S0
    punpckhwd   m4, m2                      ; m4 = S8

    punpcklwd   m0, m1, m4                  ; m0 = m128iA
    punpckhwd   m1, m4                      ; m1 = m128iD

    punpcklwd   m2, m0, m1
    pmaddwd     m3, m2, [r3 + 0 * 16]
    paddd       m3, [pd_ %+ IDCT_ROUND]     ; m3 = E1

    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, [pd_ %+ IDCT_ROUND]     ; m2 = E2

    punpckhwd   m0, m1
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, IDCT_SHIFT              ; m4 = m128iA
    paddd       m5, m2, m0
    psrad       m5, IDCT_SHIFT
    packssdw    m4, m5                      ; m4 = m128iA

    psubd       m2, m0
    psrad       m2, IDCT_SHIFT
    psubd       m3, m1
    psrad       m3, IDCT_SHIFT
    packssdw    m2, m3                      ; m2 = m128iD

    punpcklwd   m1, m4, m2
    punpckhwd   m4, m2

    punpcklwd   m0, m1, m4
    movlps      [r1 + 0 * r2], m0
    movhps      [r1 + 1 * r2], m0

    punpckhwd   m1, m4
    movlps      [r1 + 2 * r2], m1
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], m1
    RET

;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
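; The 4x4 DST-VII has no even/odd symmetry, so each output is a full dot
; product with a row of tab_dst4 (sketched per input vector s0..s3):
;   d0 = 29*s0 + 55*s1 + 74*s2 + 84*s3
;   d1 = 74*s0 + 74*s1          - 74*s3
;   d2 = 84*s0 - 29*s1 - 74*s2 + 55*s3
;   d3 = 55*s0 - 84*s1 + 74*s2 - 29*s3
; applied once per pass (shift DST4_SHIFT, then 8), like dct4 above.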
INIT_XMM sse2
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+4
  %define       coef0   m8
  %define       coef1   m9
  %define       coef2   m10
  %define       coef3   m11
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
  %define       coef0   [r3 + 0 * 16]
  %define       coef1   [r3 + 1 * 16]
  %define       coef2   [r3 + 2 * 16]
  %define       coef3   [r3 + 3 * 16]
%endif ; ARCH_X86_64

    mova        m5, [pd_ %+ DST4_ROUND]
    add         r2d, r2d
    lea         r3, [tab_dst4]
%if ARCH_X86_64
    mova        coef0, [r3 + 0 * 16]
    mova        coef1, [r3 + 1 * 16]
    mova        coef2, [r3 + 2 * 16]
    mova        coef3, [r3 + 3 * 16]
%endif
    movh        m0, [r0 + 0 * r2]            ; load
    movhps      m0, [r0 + 1 * r2]
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movhps      m1, [r0 + r2]
    pmaddwd     m2, m0, coef0                ; DST1
    pmaddwd     m3, m1, coef0
    pshufd      m6, m2, q2301
    pshufd      m7, m3, q2301
    paddd       m2, m6
    paddd       m3, m7
    pshufd      m2, m2, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m2, m3
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, coef1
    pmaddwd     m4, m1, coef1
    pshufd      m6, m4, q2301
    pshufd      m7, m3, q2301
    paddd       m4, m6
    paddd       m3, m7
    pshufd      m4, m4, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3                       ; m2 = T70
    pmaddwd     m3, m0, coef2
    pmaddwd     m4, m1, coef2
    pshufd      m6, m4, q2301
    pshufd      m7, m3, q2301
    paddd       m4, m6
    paddd       m3, m7
    pshufd      m4, m4, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    pmaddwd     m0, coef3
    pmaddwd     m1, coef3
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, DST4_SHIFT
    packssdw    m3, m0                       ; m3 = T71
    mova        m5, [pd_128]

    pmaddwd     m0, m2, coef0                ; DST2
    pmaddwd     m1, m3, coef0
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m4, m2, coef1
    pmaddwd     m1, m3, coef1
    pshufd      m6, m4, q2301
    pshufd      m7, m1, q2301
    paddd       m4, m6
    paddd       m1, m7
    pshufd      m4, m4, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m4, m1
    paddd       m4, m5
    psrad       m4, 8
    packssdw    m0, m4
    movu        [r1 + 0 * 16], m0

    pmaddwd     m0, m2, coef2
    pmaddwd     m1, m3, coef2
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m2, coef3
    pmaddwd     m3, coef3
    pshufd      m6, m2, q2301
    pshufd      m7, m3, q2301
    paddd       m2, m6
    paddd       m3, m7
    pshufd      m2, m2, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m2, m3
    paddd       m2, m5
    psrad       m2, 8
    packssdw    m0, m2
    movu        [r1 + 1 * 16], m0
    RET

;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+2
  %define       coef2   m8
  %define       coef3   m9
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
  %define       coef2   [r3 + 2 * 16]
  %define       coef3   [r3 + 3 * 16]
%endif ; ARCH_X86_64
%define         coef0   m6
%define         coef1   m7

    mova        m5, [pd_ %+ DST4_ROUND]
    add         r2d, r2d
    lea         r3, [tab_dst4]
    mova        coef0, [r3 + 0 * 16]
    mova        coef1, [r3 + 1 * 16]
%if ARCH_X86_64
    mova        coef2, [r3 + 2 * 16]
    mova        coef3, [r3 + 3 * 16]
%endif
    movh        m0, [r0 + 0 * r2]            ; load
    movh        m1, [r0 + 1 * r2]
    punpcklqdq  m0, m1
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movh        m2, [r0 + r2]
    punpcklqdq  m1, m2
    pmaddwd     m2, m0, coef0                ; DST1
    pmaddwd     m3, m1, coef0
    phaddd      m2, m3
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, coef1
    pmaddwd     m4, m1, coef1
    phaddd      m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3                       ; m2 = T70
    pmaddwd     m3, m0, coef2
    pmaddwd     m4, m1, coef2
    phaddd      m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    pmaddwd     m0, coef3
    pmaddwd     m1, coef3
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, DST4_SHIFT
    packssdw    m3, m0                       ; m3 = T71
    mova        m5, [pd_128]

    pmaddwd     m0, m2, coef0                ; DST2
    pmaddwd     m1, m3, coef0
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m4, m2, coef1
    pmaddwd     m1, m3, coef1
    phaddd      m4, m1
    paddd       m4, m5
    psrad       m4, 8
    packssdw    m0, m4
    movu        [r1 + 0 * 16], m0

    pmaddwd     m0, m2, coef2
    pmaddwd     m1, m3, coef2
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m2, coef3
    pmaddwd     m3, coef3
    phaddd      m2, m3
    paddd       m2, m5
    psrad       m2, 8
    packssdw    m0, m2
    movu        [r1 + 1 * 16], m0
    RET

;------------------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------------------
INIT_YMM avx2
cglobal dst4, 3, 4, 6
    vbroadcasti128 m5, [pd_ %+ DST4_ROUND]
    mova        m4, [trans8_shuf]
    add         r2d, r2d
    lea         r3, [pw_dst4_tab]

    movq        xm0, [r0 + 0 * r2]
    movhps      xm0, [r0 + 1 * r2]
    lea         r0, [r0 + 2 * r2]
    movq        xm1, [r0]
    movhps      xm1, [r0 + r2]

    vinserti128 m0, m0, xm1, 1          ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]

    pmaddwd     m2, m0, [r3 + 0 * 32]
    pmaddwd     m1, m0, [r3 + 1 * 32]
    phaddd      m2, m1
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, [r3 + 2 * 32]
    pmaddwd     m1, m0, [r3 + 3 * 32]
    phaddd      m3, m1
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3
    vpermd      m2, m4, m2

    vpbroadcastd m5, [pd_128]
    pmaddwd     m0, m2, [r3 + 0 * 32]
    pmaddwd     m1, m2, [r3 + 1 * 32]
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8
    pmaddwd     m3, m2, [r3 + 2 * 32]
    pmaddwd     m2, m2, [r3 + 3 * 32]
    phaddd      m3, m2
    paddd       m3, m5
    psrad       m3, 8
    packssdw    m0, m3
    vpermd      m0, m4, m0
    movu        [r1], m0
    RET

;-------------------------------------------------------
;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
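; In outline, the transpose of the dst4 matrix: per coefficient vector
; (s0..s3), in both passes (shift 7, then IDCT_SHIFT):
;   r0 = 29*s0 + 74*s1 + 84*s2 + 55*s3
;   r1 = 55*s0 + 74*s1 - 29*s2 - 84*s3
;   r2 = 74*s0          - 74*s2 + 74*s3
;   r3 = 84*s0 - 74*s1 + 55*s2 - 29*s3
; tab_idst4 stores these coefficients pre-interleaved for pmaddwd.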
INIT_XMM sse2
cglobal idst4, 3, 4, 7
    mova        m6, [pd_ %+ IDCT_ROUND]
    add         r2d, r2d
    lea         r3, [tab_idst4]
    mova        m5, [pd_64]

    movu        m0, [r0 + 0 * 16]
    movu        m1, [r0 + 1 * 16]

    punpcklwd   m2, m0, m1                  ; m2 = m128iAC
    punpckhwd   m0, m1                      ; m0 = m128iBD

    pmaddwd     m1, m2, [r3 + 0 * 16]
    pmaddwd     m3, m0, [r3 + 1 * 16]
    paddd       m1, m3
    paddd       m1, m5
    psrad       m1, 7                       ; m1 = S0

    pmaddwd     m3, m2, [r3 + 2 * 16]
    pmaddwd     m4, m0, [r3 + 3 * 16]
    paddd       m3, m4
    paddd       m3, m5
    psrad       m3, 7                       ; m3 = S8
    packssdw    m1, m3                      ; m1 = m128iA

    pmaddwd     m3, m2, [r3 + 4 * 16]
    pmaddwd     m4, m0, [r3 + 5 * 16]
    paddd       m3, m4
    paddd       m3, m5
    psrad       m3, 7                       ; m3 = S0

    pmaddwd     m2, [r3 + 6 * 16]
    pmaddwd     m0, [r3 + 7 * 16]
    paddd       m2, m0
    paddd       m2, m5
    psrad       m2, 7                       ; m2 = S8
    packssdw    m3, m2                      ; m3 = m128iD

    punpcklwd   m0, m1, m3
    punpckhwd   m1, m3

    punpcklwd   m2, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m1, m2, m0
    punpckhwd   m2, m0
    pmaddwd     m0, m1, [r3 + 0 * 16]
    pmaddwd     m3, m2, [r3 + 1 * 16]
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, IDCT_SHIFT              ; m0 = S0
    pmaddwd     m3, m1, [r3 + 2 * 16]
    pmaddwd     m4, m2, [r3 + 3 * 16]
    paddd       m3, m4
    paddd       m3, m6
    psrad       m3, IDCT_SHIFT              ; m3 = S8
    packssdw    m0, m3                      ; m0 = m128iA
    pmaddwd     m3, m1, [r3 + 4 * 16]
    pmaddwd     m4, m2, [r3 + 5 * 16]
    paddd       m3, m4
    paddd       m3, m6
    psrad       m3, IDCT_SHIFT              ; m3 = S0
    pmaddwd     m1, [r3 + 6 * 16]
    pmaddwd     m2, [r3 + 7 * 16]
    paddd       m1, m2
    paddd       m1, m6
    psrad       m1, IDCT_SHIFT              ; m1 = S8
    packssdw    m3, m1                      ; m3 = m128iD
    punpcklwd   m1, m0, m3
    punpckhwd   m0, m3

    punpcklwd   m2, m1, m0
    movlps      [r1 + 0 * r2], m2
    movhps      [r1 + 1 * r2], m2

    punpckhwd   m1, m0
    movlps      [r1 + 2 * r2], m1
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], m1
    RET

;-----------------------------------------------------------------
;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-----------------------------------------------------------------
INIT_YMM avx2
cglobal idst4, 3, 4, 6
    vbroadcasti128 m4, [pd_ %+ IDCT_ROUND]
    add         r2d, r2d
    lea         r3, [pw_idst4_tab]

    movu        xm0, [r0 + 0 * 16]
    movu        xm1, [r0 + 1 * 16]

    punpcklwd   m2, m0, m1
    punpckhwd   m0, m1

    vinserti128 m2, m2, xm2, 1
    vinserti128 m0, m0, xm0, 1

    vpbroadcastd m5, [pd_64]
    pmaddwd     m1, m2, [r3 + 0 * 32]
    pmaddwd     m3, m0, [r3 + 1 * 32]
    paddd       m1, m3
    paddd       m1, m5
    psrad       m1, 7
    pmaddwd     m3, m2, [r3 + 2 * 32]
    pmaddwd     m0, [r3 + 3 * 32]
    paddd       m3, m0
    paddd       m3, m5
    psrad       m3, 7

    packssdw    m0, m1, m3
    pshufb      m0, [pb_idst4_shuf]
    vpermq      m1, m0, 11101110b

    punpcklwd   m2, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m1, m2, m0
    punpckhwd   m2, m0

    vpermq      m1, m1, 01000100b
    vpermq      m2, m2, 01000100b

    pmaddwd     m0, m1, [r3 + 0 * 32]
    pmaddwd     m3, m2, [r3 + 1 * 32]
    paddd       m0, m3
    paddd       m0, m4
    psrad       m0, IDCT_SHIFT
    pmaddwd     m3, m1, [r3 + 2 * 32]
    pmaddwd     m2, m2, [r3 + 3 * 32]
    paddd       m3, m2
    paddd       m3, m4
    psrad       m3, IDCT_SHIFT

    packssdw    m0, m3
    pshufb      m1, m0, [pb_idst4_shuf]
    vpermq      m0, m1, 11101110b

    punpcklwd   m2, m1, m0
    movq        [r1 + 0 * r2], xm2
    movhps      [r1 + 1 * r2], xm2

    punpckhwd   m1, m0
    movq        [r1 + 2 * r2], xm1
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], xm1
    RET

;-------------------------------------------------------
; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal dct8, 3,6,8,0-16*mmsize
    ;------------------------
    ; Stack Mapping(dword)
    ;------------------------
    ; Row0[0-3] Row1[0-3]
    ; ...
    ; Row6[0-3] Row7[0-3]
    ; Row0[4-7] Row1[4-7]
    ; ...
    ; Row6[4-7] Row7[4-7]
    ;------------------------
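    ;
    ; In outline: pass 1 runs the 8-point butterfly over the input rows at
    ; DCT8_SHIFT1 and spills the 32-bit intermediates to the stack (the
    ; second half is stored reversed via pshufd 0x1B so pass 2 can form the
    ; x[i] + x[7-i] sums with a plain paddd); pass 2 repeats the butterfly
    ; over those intermediates at DCT8_SHIFT2, completing the separable
    ; 2-D transform.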

    add         r2, r2
    lea         r3, [r2 * 3]
    mov         r5, rsp
%assign x 0
%rep 2
    movu        m0, [r0]
    movu        m1, [r0 + r2]
    movu        m2, [r0 + r2 * 2]
    movu        m3, [r0 + r3]

    punpcklwd   m4, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m5, m2, m3
    punpckhwd   m2, m3
    punpckldq   m1, m4, m5          ; m1 = [1 0]
    punpckhdq   m4, m5              ; m4 = [3 2]
    punpckldq   m3, m0, m2
    punpckhdq   m0, m2
    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
    pshufd      m0, m0, 0x4E        ; m0 = [6 7]

    paddw       m3, m1, m0
    psubw       m1, m0              ; m1 = [d1 d0]
    paddw       m0, m4, m2
    psubw       m4, m2              ; m4 = [d3 d2]
    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
    punpckhqdq  m3, m0
    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]

    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
    punpckhwd   m1, m4              ; m1 = [d3/d1]
    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]

    ; odd
    lea         r4, [tab_dct8_1]
    pmaddwd     m1, m4, [r4 + 0*16]
    pmaddwd     m5, m0, [r4 + 0*16]
    pshufd      m1, m1, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m1
    punpckhqdq  m7, m5
    punpcklqdq  m1, m5
    paddd       m1, m7
    paddd       m1, [pd_ %+ DCT8_ROUND1]
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 1*2*mmsize], m1 ; Row 1

    pmaddwd     m1, m4, [r4 + 1*16]
    pmaddwd     m5, m0, [r4 + 1*16]
    pshufd      m1, m1, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m1
    punpckhqdq  m7, m5
    punpcklqdq  m1, m5
    paddd       m1, m7
    paddd       m1, [pd_ %+ DCT8_ROUND1]
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 3*2*mmsize], m1 ; Row 3

    pmaddwd     m1, m4, [r4 + 2*16]
    pmaddwd     m5, m0, [r4 + 2*16]
    pshufd      m1, m1, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m1
    punpckhqdq  m7, m5
    punpcklqdq  m1, m5
    paddd       m1, m7
    paddd       m1, [pd_ %+ DCT8_ROUND1]
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 5*2*mmsize], m1 ; Row 5

    pmaddwd     m4, [r4 + 3*16]
    pmaddwd     m0, [r4 + 3*16]
    pshufd      m4, m4, 0xD8
    pshufd      m0, m0, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m0
    punpcklqdq  m4, m0
    paddd       m4, m7
    paddd       m4, [pd_ %+ DCT8_ROUND1]
    psrad       m4, DCT8_SHIFT1
  %if x == 1
    pshufd      m4, m4, 0x1B
  %endif
    mova        [r5 + 7*2*mmsize], m4 ; Row 7

    ; even
    lea         r4, [tab_dct4]
    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
    pshufd      m0, m0, 0xD8
    pshuflw     m0, m0, 0xD8
    pshufhw     m0, m0, 0xD8
    psubw       m2, m3              ; m2 = [EO1 EO0]
    pmullw      m2, [pw_ppppmmmm]
    pshufd      m2, m2, 0xD8
    pshuflw     m2, m2, 0xD8
    pshufhw     m2, m2, 0xD8
    pmaddwd     m3, m0, [r4 + 0*16]
    paddd       m3, [pd_ %+ DCT8_ROUND1]
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 0*2*mmsize], m3 ; Row 0
    pmaddwd     m0, [r4 + 2*16]
    paddd       m0, [pd_ %+ DCT8_ROUND1]
    psrad       m0, DCT8_SHIFT1
  %if x == 1
    pshufd      m0, m0, 0x1B
  %endif
    mova        [r5 + 4*2*mmsize], m0 ; Row 4
    pmaddwd     m3, m2, [r4 + 1*16]
    paddd       m3, [pd_ %+ DCT8_ROUND1]
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 2*2*mmsize], m3 ; Row 2
    pmaddwd     m2, [r4 + 3*16]
    paddd       m2, [pd_ %+ DCT8_ROUND1]
    psrad       m2, DCT8_SHIFT1
  %if x == 1
    pshufd      m2, m2, 0x1B
  %endif
    mova        [r5 + 6*2*mmsize], m2 ; Row 6

  %if x != 1
    lea         r0, [r0 + r2 * 4]
    add         r5, mmsize
  %endif
%assign x x+1
%endrep

    mov         r0, rsp                 ; r0 = pointer to Low Part
    lea         r4, [tab_dct8_2]

%assign x 0
%rep 4
    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
    mova        m1, [r0 + 1*2*mmsize]
    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
    pshufd      m3, m3, 0x9C            ; m3 = ^^
    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^

    ; even
    pshufd      m4, m2, 0xD8
    pshufd      m3, m3, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m3
    punpcklqdq  m4, m3
    mova        m2, m4
    paddd       m4, m7                  ; m4 = [EE1 EE0 EE1 EE0]
    psubd       m2, m7                  ; m2 = [EO1 EO0 EO1 EO0]

    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
    mova        m5, m2
    pmuludq     m5, [r4 + 0*16]
    pshufd      m7, m2, 0xF5
    movu        m6, [r4 + 0*16 + 4]
    pmuludq     m7, m6
    pshufd      m5, m5, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m5, m7                  ; m5 = [36*EO1 83*EO0]
    pshufd      m7, m2, 0xF5
    pmuludq     m2, [r4 + 1*16]
    movu        m6, [r4 + 1*16 + 4]
    pmuludq     m7, m6
    pshufd      m2, m2, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m2, m7                  ; m2 = [83*EO1 36*EO0]

    pshufd      m3, m4, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m3
    punpckhqdq  m7, m5
    punpcklqdq  m3, m5
    paddd       m3, m7                  ; m3 = [Row2 Row0]
    paddd       m3, [pd_ %+ DCT8_ROUND2]
    psrad       m3, DCT8_SHIFT2
    pshufd      m4, m4, 0xD8
    pshufd      m2, m2, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m2
    punpcklqdq  m4, m2
    psubd       m4, m7                  ; m4 = [Row6 Row4]
    paddd       m4, [pd_ %+ DCT8_ROUND2]
    psrad       m4, DCT8_SHIFT2

    packssdw    m3, m3
    movd        [r1 + 0*mmsize], m3
    pshufd      m3, m3, 1
    movd        [r1 + 2*mmsize], m3

    packssdw    m4, m4
    movd        [r1 + 4*mmsize], m4
    pshufd      m4, m4, 1
    movd        [r1 + 6*mmsize], m4

    ; odd
    mova        m2, m0
    pmuludq     m2, [r4 + 2*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 2*16 + 4]
    pmuludq     m7, m6
    pshufd      m2, m2, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m2, m7
    mova        m3, m1
    pmuludq     m3, [r4 + 2*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m3, m3, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m3, m7
    mova        m4, m0
    pmuludq     m4, [r4 + 3*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 3*16 + 4]
    pmuludq     m7, m6
    pshufd      m4, m4, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m4, m7
    mova        m5, m1
    pmuludq     m5, [r4 + 3*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m5, m5, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m5, m7
    pshufd      m2, m2, 0xD8
    pshufd      m3, m3, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m3
    punpcklqdq  m2, m3
    paddd       m2, m7
    pshufd      m4, m4, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m5
    punpcklqdq  m4, m5
    paddd       m4, m7
    pshufd      m2, m2, 0xD8
    pshufd      m4, m4, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m4
    punpcklqdq  m2, m4
    paddd       m2, m7                  ; m2 = [Row3 Row1]
    paddd       m2, [pd_ %+ DCT8_ROUND2]
    psrad       m2, DCT8_SHIFT2

    packssdw    m2, m2
    movd        [r1 + 1*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 3*mmsize], m2

    mova        m2, m0
    pmuludq     m2, [r4 + 4*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 4*16 + 4]
    pmuludq     m7, m6
    pshufd      m2, m2, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m2, m7
    mova        m3, m1
    pmuludq     m3, [r4 + 4*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m3, m3, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m3, m7
    mova        m4, m0
    pmuludq     m4, [r4 + 5*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 5*16 + 4]
    pmuludq     m7, m6
    pshufd      m4, m4, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m4, m7
    mova        m5, m1
    pmuludq     m5, [r4 + 5*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m5, m5, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m5, m7
    pshufd      m2, m2, 0xD8
    pshufd      m3, m3, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m3
    punpcklqdq  m2, m3
    paddd       m2, m7
    pshufd      m4, m4, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m5
    punpcklqdq  m4, m5
    paddd       m4, m7
    pshufd      m2, m2, 0xD8
    pshufd      m4, m4, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m4
    punpcklqdq  m2, m4
    paddd       m2, m7                  ; m2 = [Row7 Row5]
    paddd       m2, [pd_ %+ DCT8_ROUND2]
    psrad       m2, DCT8_SHIFT2

    packssdw    m2, m2
    movd        [r1 + 5*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 7*mmsize], m2
%if x < 3
    add         r1, mmsize/4
    add         r0, 2*2*mmsize
%endif
%assign x x+1
%endrep

    RET

;-------------------------------------------------------
; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;-------------------------------------------------------
INIT_XMM sse4
cglobal dct8, 3,6,7,0-16*mmsize
    ;------------------------
    ; Stack Mapping(dword)
    ;------------------------
    ; Row0[0-3] Row1[0-3]
    ; ...
    ; Row6[0-3] Row7[0-3]
    ; Row0[4-7] Row1[4-7]
    ; ...
    ; Row6[4-7] Row7[4-7]
    ;------------------------
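    ;
    ; Same two-pass scheme as the SSE2 dct8 above; SSE4 collapses the
    ; pshufd/punpck shuffle sequences into phaddd/phsubd, replaces the
    ; pmullw sign flip with psignw, and uses pmulld for the 32-bit
    ; multiplies in pass 2.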
    mova        m6, [pd_ %+ DCT8_ROUND1]

    add         r2, r2
    lea         r3, [r2 * 3]
    mov         r5, rsp
%assign x 0
%rep 2
    movu        m0, [r0]
    movu        m1, [r0 + r2]
    movu        m2, [r0 + r2 * 2]
    movu        m3, [r0 + r3]

    punpcklwd   m4, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m5, m2, m3
    punpckhwd   m2, m3
    punpckldq   m1, m4, m5          ; m1 = [1 0]
    punpckhdq   m4, m5              ; m4 = [3 2]
    punpckldq   m3, m0, m2
    punpckhdq   m0, m2
    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
    pshufd      m0, m0, 0x4E        ; m0 = [6 7]

    paddw       m3, m1, m0
    psubw       m1, m0              ; m1 = [d1 d0]
    paddw       m0, m4, m2
    psubw       m4, m2              ; m4 = [d3 d2]
    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
    punpckhqdq  m3, m0
    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]

    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
    punpckhwd   m1, m4              ; m1 = [d3/d1]
    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]

    ; odd
    lea         r4, [tab_dct8_1]
    pmaddwd     m1, m4, [r4 + 0*16]
    pmaddwd     m5, m0, [r4 + 0*16]
    phaddd      m1, m5
    paddd       m1, m6
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 1*2*mmsize], m1 ; Row 1

    pmaddwd     m1, m4, [r4 + 1*16]
    pmaddwd     m5, m0, [r4 + 1*16]
    phaddd      m1, m5
    paddd       m1, m6
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 3*2*mmsize], m1 ; Row 3

    pmaddwd     m1, m4, [r4 + 2*16]
    pmaddwd     m5, m0, [r4 + 2*16]
    phaddd      m1, m5
    paddd       m1, m6
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 5*2*mmsize], m1 ; Row 5

    pmaddwd     m4, [r4 + 3*16]
    pmaddwd     m0, [r4 + 3*16]
    phaddd      m4, m0
    paddd       m4, m6
    psrad       m4, DCT8_SHIFT1
  %if x == 1
    pshufd      m4, m4, 0x1B
  %endif
    mova        [r5 + 7*2*mmsize], m4 ; Row 7

    ; even
    lea         r4, [tab_dct4]
    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
    pshufb      m0, [pb_unpackhlw1]
    psubw       m2, m3              ; m2 = [EO1 EO0]
    psignw      m2, [pw_ppppmmmm]
    pshufb      m2, [pb_unpackhlw1]
    pmaddwd     m3, m0, [r4 + 0*16]
    paddd       m3, m6
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 0*2*mmsize], m3 ; Row 0
    pmaddwd     m0, [r4 + 2*16]
    paddd       m0, m6
    psrad       m0, DCT8_SHIFT1
  %if x == 1
    pshufd      m0, m0, 0x1B
  %endif
    mova        [r5 + 4*2*mmsize], m0 ; Row 4
    pmaddwd     m3, m2, [r4 + 1*16]
    paddd       m3, m6
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 2*2*mmsize], m3 ; Row 2
    pmaddwd     m2, [r4 + 3*16]
    paddd       m2, m6
    psrad       m2, DCT8_SHIFT1
  %if x == 1
    pshufd      m2, m2, 0x1B
  %endif
    mova        [r5 + 6*2*mmsize], m2 ; Row 6

  %if x != 1
    lea         r0, [r0 + r2 * 4]
    add         r5, mmsize
  %endif
%assign x x+1
%endrep

    mov         r2, 2
    mov         r0, rsp                 ; r0 = pointer to Low Part
    lea         r4, [tab_dct8_2]
    mova        m6, [pd_256]

.pass2:
%rep 2
    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
    mova        m1, [r0 + 1*2*mmsize]
    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
    pshufd      m3, m3, 0x9C            ; m3 = ^^
    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^

    ; even
    phaddd      m4, m2, m3              ; m4 = [EE1 EE0 EE1 EE0]
    phsubd      m2, m3                  ; m2 = [EO1 EO0 EO1 EO0]

    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
    pmulld      m5, m2, [r4 + 0*16]     ; m5 = [36*EO1 83*EO0]
    pmulld      m2, [r4 + 1*16]         ; m2 = [83*EO1 36*EO0]

    phaddd      m3, m4, m5              ; m3 = [Row2 Row0]
    paddd       m3, m6
    psrad       m3, 9
    phsubd      m4, m2                  ; m4 = [Row6 Row4]
    paddd       m4, m6
    psrad       m4, 9

    packssdw    m3, m3
    movd        [r1 + 0*mmsize], m3
    pshufd      m3, m3, 1
    movd        [r1 + 2*mmsize], m3

    packssdw    m4, m4
    movd        [r1 + 4*mmsize], m4
    pshufd      m4, m4, 1
    movd        [r1 + 6*mmsize], m4

    ; odd
    pmulld      m2, m0, [r4 + 2*16]
    pmulld      m3, m1, [r4 + 2*16]
    pmulld      m4, m0, [r4 + 3*16]
    pmulld      m5, m1, [r4 + 3*16]
    phaddd      m2, m3
    phaddd      m4, m5
    phaddd      m2, m4                  ; m2 = [Row3 Row1]
    paddd       m2, m6
    psrad       m2, 9

    packssdw    m2, m2
    movd        [r1 + 1*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 3*mmsize], m2

    pmulld      m2, m0, [r4 + 4*16]
    pmulld      m3, m1, [r4 + 4*16]
    pmulld      m4, m0, [r4 + 5*16]
    pmulld      m5, m1, [r4 + 5*16]
    phaddd      m2, m3
    phaddd      m4, m5
    phaddd      m2, m4                  ; m2 = [Row7 Row5]
    paddd       m2, m6
    psrad       m2, 9

    packssdw    m2, m2
    movd        [r1 + 5*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 7*mmsize], m2

    add         r1, mmsize/4
    add         r0, 2*2*mmsize
%endrep

    dec         r2
    jnz        .pass2
    RET

;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
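; Two passes at the usual inverse shifts (7 with pd_64, then IDCT_SHIFT
; with IDCT_ROUND); the even part reuses the tab_dct4 rows and the odd
; part pmaddwd's against tab_idct8_3. This SSE2 path needs all 16 XMM
; registers plus a small stack scratch area, hence the ARCH_X86_64 guard.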
%if ARCH_X86_64
INIT_XMM sse2
cglobal idct8, 3, 6, 16, 0-5*mmsize
    mova        m9, [r0 + 1 * mmsize]
    mova        m1, [r0 + 3 * mmsize]
    mova        m7, m9
    punpcklwd   m7, m1
    punpckhwd   m9, m1
    mova        m14, [tab_idct8_3]
    mova        m3, m14
    pmaddwd     m14, m7
    pmaddwd     m3, m9
    mova        m0, [r0 + 5 * mmsize]
    mova        m10, [r0 + 7 * mmsize]
    mova        m2, m0
    punpcklwd   m2, m10
    punpckhwd   m0, m10
    mova        m15, [tab_idct8_3 + 1 * mmsize]
    mova        m11, [tab_idct8_3 + 1 * mmsize]
    pmaddwd     m15, m2
    mova        m4, [tab_idct8_3 + 2 * mmsize]
    pmaddwd     m11, m0
    mova        m1, [tab_idct8_3 + 2 * mmsize]
    paddd       m15, m14
    mova        m5, [tab_idct8_3 + 4 * mmsize]
    mova        m12, [tab_idct8_3 + 4 * mmsize]
    paddd       m11, m3
    mova        [rsp + 0 * mmsize], m11
    mova        [rsp + 1 * mmsize], m15
    pmaddwd     m4, m7
    pmaddwd     m1, m9
    mova        m14, [tab_idct8_3 + 3 * mmsize]
    mova        m3, [tab_idct8_3 + 3 * mmsize]
    pmaddwd     m14, m2
    pmaddwd     m3, m0
    paddd       m14, m4
    paddd       m3, m1
    mova        [rsp + 2 * mmsize], m3
    pmaddwd     m5, m9
    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
    mova        m6, [tab_idct8_3 + 5 * mmsize]
    pmaddwd     m12, m7
    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
    mova        m4, [tab_idct8_3 + 5 * mmsize]
    pmaddwd     m6, m2
    paddd       m6, m12
    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
    paddd       m7, m2
    mova        [rsp + 3 * mmsize], m6
    pmaddwd     m4, m0
    pmaddwd     m0, [tab_idct8_3 + 7 * mmsize]
    paddd       m9, m0
    paddd       m5, m4
    mova        m6, [r0 + 0 * mmsize]
    mova        m0, [r0 + 4 * mmsize]
    mova        m4, m6
    punpcklwd   m4, m0
    punpckhwd   m6, m0
    mova        m12, [r0 + 2 * mmsize]
    mova        m0, [r0 + 6 * mmsize]
    mova        m13, m12
    mova        m8, [tab_dct4]
    punpcklwd   m13, m0
    mova        m10, [tab_dct4]
    punpckhwd   m12, m0
    pmaddwd     m8, m4
    mova        m3, m8
    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
    pmaddwd     m10, m6
    mova        m2, [tab_dct4 + 1 * mmsize]
    mova        m1, m10
    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
    mova        m0, [tab_dct4 + 1 * mmsize]
    pmaddwd     m2, m13
    paddd       m3, m2
    psubd       m8, m2
    mova        m2, m6
    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
    pmaddwd     m0, m12
    paddd       m1, m0
    psubd       m10, m0
    mova        m0, m4
    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
    paddd       m3, [pd_64]
    paddd       m1, [pd_64]
    paddd       m8, [pd_64]
    paddd       m10, [pd_64]
    paddd       m0, m13
    paddd       m2, m12
    paddd       m0, [pd_64]
    paddd       m2, [pd_64]
    psubd       m4, m13
    psubd       m6, m12
    paddd       m4, [pd_64]
    paddd       m6, [pd_64]
    mova        m12, m8
    psubd       m8, m7
    psrad       m8, 7
    paddd       m15, m3
    psubd       m3, [rsp + 1 * mmsize]
    psrad       m15, 7
    paddd       m12, m7
    psrad       m12, 7
    paddd       m11, m1
    mova        m13, m14
    psrad       m11, 7
    packssdw    m15, m11
    psubd       m1, [rsp + 0 * mmsize]
    psrad       m1, 7
    mova        m11, [rsp + 2 * mmsize]
    paddd       m14, m0
    psrad       m14, 7
    psubd       m0, m13
    psrad       m0, 7
    paddd       m11, m2
    mova        m13, [rsp + 3 * mmsize]
    psrad       m11, 7
    packssdw    m14, m11
    mova        m11, m6
    psubd       m6, m5
    paddd       m13, m4
    psrad       m13, 7
    psrad       m6, 7
    paddd       m11, m5
    psrad       m11, 7
    packssdw    m13, m11
    mova        m11, m10
    psubd       m4, [rsp + 3 * mmsize]
    psubd       m10, m9
    psrad       m4, 7
    psrad       m10, 7
    packssdw    m4, m6
    packssdw    m8, m10
    paddd       m11, m9
    psrad       m11, 7
    packssdw    m12, m11
    psubd       m2, [rsp + 2 * mmsize]
    mova        m5, m15
    psrad       m2, 7
    packssdw    m0, m2
    mova        m2, m14
    psrad       m3, 7
    packssdw    m3, m1
    mova        m6, m13
    punpcklwd   m5, m8
    punpcklwd   m2, m4
    mova        m1, m12
    punpcklwd   m6, m0
    punpcklwd   m1, m3
    mova        m9, m5
    punpckhwd   m13, m0
    mova        m0, m2
    punpcklwd   m9, m6
    punpckhwd   m5, m6
    punpcklwd   m0, m1
    punpckhwd   m2, m1
    punpckhwd   m15, m8
    mova        m1, m5
    punpckhwd   m14, m4
    punpckhwd   m12, m3
    mova        m6, m9
    punpckhwd   m9, m0
    punpcklwd   m1, m2
    mova        m4, [tab_idct8_3 + 0 * mmsize]
    punpckhwd   m5, m2
    punpcklwd   m6, m0
    mova        m2, m15
    mova        m0, m14
    mova        m7, m9
    punpcklwd   m2, m13
    punpcklwd   m0, m12
    punpcklwd   m7, m5
    punpckhwd   m14, m12
    mova        m10, m2
    punpckhwd   m15, m13
    punpckhwd   m9, m5
    pmaddwd     m4, m7
    mova        m13, m1
    punpckhwd   m2, m0
    punpcklwd   m10, m0
    mova        m0, m15
    punpckhwd   m15, m14
    mova        m12, m1
    mova        m3, [tab_idct8_3 + 0 * mmsize]
    punpcklwd   m0, m14
    pmaddwd     m3, m9
    mova        m11, m2
    punpckhwd   m2, m15
    punpcklwd   m11, m15
    mova        m8, [tab_idct8_3 + 1 * mmsize]
    punpcklwd   m13, m0
    punpckhwd   m12, m0
    pmaddwd     m8, m11
    paddd       m8, m4
    mova        [rsp + 4 * mmsize], m8
    mova        m4, [tab_idct8_3 + 2 * mmsize]
    pmaddwd     m4, m7
    mova        m15, [tab_idct8_3 + 2 * mmsize]
    mova        m5, [tab_idct8_3 + 1 * mmsize]
    pmaddwd     m15, m9
    pmaddwd     m5, m2
    paddd       m5, m3
    mova        [rsp + 3 * mmsize], m5
    mova        m14, [tab_idct8_3 + 3 * mmsize]
    mova        m5, [tab_idct8_3 + 3 * mmsize]
    pmaddwd     m14, m11
    paddd       m14, m4
    mova        [rsp + 2 * mmsize], m14
    pmaddwd     m5, m2
    paddd       m5, m15
    mova        [rsp + 1 * mmsize], m5
    mova        m15, [tab_idct8_3 + 4 * mmsize]
    mova        m5, [tab_idct8_3 + 4 * mmsize]
    pmaddwd     m15, m7
    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
    pmaddwd     m5, m9
    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
    mova        m4, [tab_idct8_3 + 5 * mmsize]
    pmaddwd     m4, m2
    paddd       m5, m4
    mova        m4, m6
    mova        m8, [tab_idct8_3 + 5 * mmsize]
    punpckhwd   m6, m10
    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
    punpcklwd   m4, m10
    paddd       m9, m2
    pmaddwd     m8, m11
    mova        m10, [tab_dct4]
    paddd       m8, m15
    pmaddwd     m11, [tab_idct8_3 + 7 * mmsize]
    paddd       m7, m11
    mova        [rsp + 0 * mmsize], m8
    pmaddwd     m10, m6
    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
    mova        m1, m10
    mova        m8, [tab_dct4]
    mova        m3, [tab_dct4 + 1 * mmsize]
    pmaddwd     m8, m4
    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
    mova        m0, m8
    mova        m2, [tab_dct4 + 1 * mmsize]
    pmaddwd     m3, m13
    psubd       m8, m3
    paddd       m0, m3
    mova        m3, m6
    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
    pmaddwd     m2, m12
    paddd       m1, m2
    psubd       m10, m2
    mova        m2, m4
    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
    mova        m15, [pd_ %+ IDCT_ROUND]
    paddd       m0, m15
    paddd       m1, m15
    paddd       m8, m15
    paddd       m10, m15
    paddd       m2, m13
    paddd       m3, m12
    paddd       m2, m15
    paddd       m3, m15
    psubd       m4, m13
    psubd       m6, m12
    paddd       m4, m15
    paddd       m6, m15
    mova        m15, [rsp + 4 * mmsize]
    mova        m12, m8
    psubd       m8, m7
    psrad       m8, IDCT_SHIFT
    mova        m11, [rsp + 3 * mmsize]
    paddd       m15, m0
    psrad       m15, IDCT_SHIFT
    psubd       m0, [rsp + 4 * mmsize]
    psrad       m0, IDCT_SHIFT
    paddd       m12, m7
    paddd       m11, m1
    mova        m14, [rsp + 2 * mmsize]
    psrad       m11, IDCT_SHIFT
    packssdw    m15, m11
    psubd       m1, [rsp + 3 * mmsize]
    psrad       m1, IDCT_SHIFT
    mova        m11, [rsp + 1 * mmsize]
    paddd       m14, m2
    psrad       m14, IDCT_SHIFT
    packssdw    m0, m1
    psrad       m12, IDCT_SHIFT
    psubd       m2, [rsp + 2 * mmsize]
    paddd       m11, m3
    mova        m13, [rsp + 0 * mmsize]
    psrad       m11, IDCT_SHIFT
    packssdw    m14, m11
    mova        m11, m6
    psubd       m6, m5
    paddd       m13, m4
    psrad       m13, IDCT_SHIFT
    mova        m1, m15
    paddd       m11, m5
    psrad       m11, IDCT_SHIFT
    packssdw    m13, m11
    mova        m11, m10
    psubd       m10, m9
    psrad       m10, IDCT_SHIFT
    packssdw    m8, m10
    psrad       m6, IDCT_SHIFT
    psubd       m4, [rsp + 0 * mmsize]
    paddd       m11, m9
    psrad       m11, IDCT_SHIFT
    packssdw    m12, m11
    punpcklwd   m1, m14
    mova        m5, m13
    psrad       m4, IDCT_SHIFT
    packssdw    m4, m6
    psubd       m3, [rsp + 1 * mmsize]
    psrad       m2, IDCT_SHIFT
    mova        m6, m8
    psrad       m3, IDCT_SHIFT
    punpcklwd   m5, m12
    packssdw    m2, m3
    punpcklwd   m6, m4
    punpckhwd   m8, m4
    mova        m4, m1
    mova        m3, m2
    punpckhdq   m1, m5
    punpckldq   m4, m5
    punpcklwd   m3, m0
    punpckhwd   m2, m0
    mova        m0, m6
    lea         r2, [r2 + r2]               ; r2 = stride * 2 (int16_t rows, in bytes)
    lea         r4, [r2 + r2]
    lea         r3, [r4 + r2]               ; r3 = 3 * r2
    lea         r4, [r4 + r3]               ; r4 = 5 * r2
    lea         r0, [r4 + r2 * 2]           ; r0 = 7 * r2
    movq        [r1], m4
    punpckhwd   m15, m14
    movhps      [r1 + r2], m4
    punpckhdq   m0, m3
    movq        [r1 + r2 * 2], m1
    punpckhwd   m13, m12
    movhps      [r1 + r3], m1
    mova        m1, m6
    punpckldq   m1, m3
    movq        [r1 + 8], m1
    movhps      [r1 + r2 + 8], m1
    movq        [r1 + r2 * 2 + 8], m0
    movhps      [r1 + r3 + 8], m0
    mova        m0, m15
    punpckhdq   m15, m13
    punpckldq   m0, m13
    movq        [r1 + r2 * 4], m0
    movhps      [r1 + r4], m0
    mova        m0, m8
    punpckhdq   m8, m2
    movq        [r1 + r3 * 2], m15
    punpckldq   m0, m2
    movhps      [r1 + r0], m15
    movq        [r1 + r2 * 4 + 8], m0
    movhps      [r1 + r4 + 8], m0
    movq        [r1 + r3 * 2 + 8], m8
    movhps      [r1 + r0 + 8], m8
    RET
%endif

;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
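; The SSSE3 path does the 8x8 inverse transform in two passes through an
; aligned scratch buffer on the stack: pass 1 transforms columns with a
; fixed shift of 7, pass 2 transforms rows with the BIT_DEPTH-dependent
; IDCT_SHIFT.  As a rough scalar sketch of one butterfly lane (names are
; illustrative, not from the source):
;
;     E[k] = sum_i src[2*i]     * even_coeff[k][i]    ; even input rows
;     O[k] = sum_i src[2*i + 1] * odd_coeff[k][i]     ; odd input rows
;     dst[k]     = clip16((E[k] + O[k] + round) >> shift)
;     dst[7 - k] = clip16((E[k] - O[k] + round) >> shift)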
INIT_XMM ssse3
cglobal partial_butterfly_inverse_internal_pass1
    movh        m0, [r0]
    movhps      m0, [r0 + 2 * 16]
    movh        m1, [r0 + 4 * 16]
    movhps      m1, [r0 + 6 * 16]

    punpckhwd   m2, m0, m1                  ; [2 6]
    punpcklwd   m0, m1                      ; [0 4]
    pmaddwd     m1, m0, [r6]                ; EE[0]
    pmaddwd     m0, [r6 + 32]               ; EE[1]
    pmaddwd     m3, m2, [r6 + 16]           ; EO[0]
    pmaddwd     m2, [r6 + 48]               ; EO[1]

    paddd       m4, m1, m3                  ; E[0]
    psubd       m1, m3                      ; E[3]
    paddd       m3, m0, m2                  ; E[1]
    psubd       m0, m2                      ; E[2]

    ;E[k] = E[k] + add
    mova        m5, [pd_64]
    paddd       m0, m5
    paddd       m1, m5
    paddd       m3, m5
    paddd       m4, m5

    movh        m2, [r0 + 16]
    movhps      m2, [r0 + 5 * 16]
    movh        m5, [r0 + 3 * 16]
    movhps      m5, [r0 + 7 * 16]
    punpcklwd   m6, m2, m5                  ;[1 3]
    punpckhwd   m2, m5                      ;[5 7]

    pmaddwd     m5, m6, [r4]
    pmaddwd     m7, m2, [r4 + 16]
    paddd       m5, m7                      ; O[0]

    paddd       m7, m4, m5
    psrad       m7, 7

    psubd       m4, m5
    psrad       m4, 7

    packssdw    m7, m4
    movh        [r5 + 0 * 16], m7
    movhps      [r5 + 7 * 16], m7

    pmaddwd     m5, m6, [r4 + 32]
    pmaddwd     m4, m2, [r4 + 48]
    paddd       m5, m4                      ; O[1]

    paddd       m4, m3, m5
    psrad       m4, 7

    psubd       m3, m5
    psrad       m3, 7

    packssdw    m4, m3
    movh        [r5 + 1 * 16], m4
    movhps      [r5 + 6 * 16], m4

    pmaddwd     m5, m6, [r4 + 64]
    pmaddwd     m4, m2, [r4 + 80]
    paddd       m5, m4                      ; O[2]

    paddd       m4, m0, m5
    psrad       m4, 7

    psubd       m0, m5
    psrad       m0, 7

    packssdw    m4, m0
    movh        [r5 + 2 * 16], m4
    movhps      [r5 + 5 * 16], m4

    pmaddwd     m5, m6, [r4 + 96]
    pmaddwd     m4, m2, [r4 + 112]
    paddd       m5, m4                      ; O[3]

    paddd       m4, m1, m5
    psrad       m4, 7

    psubd       m1, m5
    psrad       m1, 7

    packssdw    m4, m1
    movh        [r5 + 3 * 16], m4
    movhps      [r5 + 4 * 16], m4

    ret

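; PARTIAL_BUTTERFLY_PROCESS_ROW finishes one row in pass 2: the even part E
; is built from the pshufb/pmaddwd/phaddd sequence over the even-indexed
; coefficients, the odd part O from the reordered odd coefficients, and the
; row is completed as clip16((E +/- O + round) >> IDCT_SHIFT), with the
; E - O half reversed by the final pshufd.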
%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
    pshufb      m4, %1, [pb_idct8even]
    pmaddwd     m4, [tab_idct8_1]
    phsubd      m5, m4                      ; m5 high qword = [e2 e3]; low qword is don't-care
    pshufd      m4, m4, 0x4E
    phaddd      m4, m4
    punpckhqdq  m4, m5                      ;m4 = dword e[0 1 2 3]
    paddd       m4, m6

    pshufb      %1, %1, [r6]
    pmaddwd     m5, %1, [r4]
    pmaddwd     %1, [r4 + 16]
    phaddd      m5, %1                      ; m5 = dword O[0 1 2 3]

    paddd       %1, m4, m5
    psrad       %1, IDCT_SHIFT

    psubd       m4, m5
    psrad       m4, IDCT_SHIFT
    pshufd      m4, m4, 0x1B

    packssdw    %1, m4
%endmacro

INIT_XMM ssse3
cglobal partial_butterfly_inverse_internal_pass2
    mova        m0, [r5]
    PARTIAL_BUTTERFLY_PROCESS_ROW m0
    movu        [r1], m0

    mova        m2, [r5 + 16]
    PARTIAL_BUTTERFLY_PROCESS_ROW m2
    movu        [r1 + r2], m2

    mova        m1, [r5 + 32]
    PARTIAL_BUTTERFLY_PROCESS_ROW m1
    movu        [r1 + 2 * r2], m1

    mova        m3, [r5 + 48]
    PARTIAL_BUTTERFLY_PROCESS_ROW m3
    movu        [r1 + r3], m3
    ret

INIT_XMM ssse3
cglobal idct8, 3,7,8 ;,0-16*mmsize (stack is reserved and aligned manually below)
    ; align the stack to 64 bytes
    mov         r5, rsp
    sub         rsp, 16*mmsize + gprsize
    and         rsp, ~(64-1)
    mov         [rsp + 16*mmsize], r5
    mov         r5, rsp

    lea         r4, [tab_idct8_3]
    lea         r6, [tab_dct4]

    call        partial_butterfly_inverse_internal_pass1

    add         r0, 8
    add         r5, 8

    call        partial_butterfly_inverse_internal_pass1

    mova        m6, [pd_ %+ IDCT_ROUND]
    add         r2, r2
    lea         r3, [r2 * 3]
    lea         r4, [tab_idct8_2]
    lea         r6, [pb_idct8odd]
    sub         r5, 8

    call        partial_butterfly_inverse_internal_pass2

    lea         r1, [r1 + 4 * r2]
    add         r5, 64

    call        partial_butterfly_inverse_internal_pass2

    ; restore the original stack pointer
    mov         rsp, [rsp + 16*mmsize]
    RET


;-----------------------------------------------------------------------------
; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
;-----------------------------------------------------------------------------
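; Both variants below accumulate the absolute level of each coefficient into
; sum[] and soft-threshold the coefficient by offset[], keeping its sign.
; A scalar sketch of one element (illustrative only):
;
;     level   = abs(dct[i]);
;     sum[i] += level;
;     level   = max(0, level - offset[i]);   // psubusw saturates at 0
;     dct[i]  = level ? (dct[i] < 0 ? -level : level) : 0;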
INIT_XMM sse4
cglobal denoise_dct, 4, 4, 6
    pxor     m5,  m5
    shr      r3d, 3
.loop:
    mova     m0, [r0]
    pabsw    m1, m0

    mova     m2, [r1]
    pmovsxwd m3, m1
    paddd    m2, m3
    mova     [r1], m2
    mova     m2, [r1 + 16]
    psrldq   m3, m1, 8
    pmovsxwd m4, m3
    paddd    m2, m4
    mova     [r1 + 16], m2

    movu     m3, [r2]
    psubusw  m1, m3
    pcmpgtw  m4, m1, m5
    pand     m1, m4
    psignw   m1, m0
    mova     [r0], m1
    add      r0, 16
    add      r1, 32
    add      r2, 16
    dec      r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal denoise_dct, 4, 4, 6
    pxor     m5,  m5
    shr      r3d, 4
.loop:
    movu     m0, [r0]
    pabsw    m1, m0
    movu     m2, [r1]
    pmovsxwd m4, xm1
    paddd    m2, m4
    movu     [r1], m2
    vextracti128 xm4, m1, 1
    movu     m2, [r1 + 32]
    pmovsxwd m3, xm4
    paddd    m2, m3
    movu     [r1 + 32], m2
    movu     m3, [r2]
    psubusw  m1, m3
    pcmpgtw  m4, m1, m5
    pand     m1, m4
    psignw   m1, m0
    movu     [r0], m1
    add      r0, 32
    add      r1, 64
    add      r2, 32
    dec      r3d
    jnz .loop
    RET

%if ARCH_X86_64 == 1
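; DCT8_PASS_1 multiplies one broadcast coefficient row from tab_dct8 against
; the E/O-folded inputs (%3 = sums, %4 = differences) and stores one 8-entry
; intermediate row to the stack; DCT8_PASS_2 then applies the row transform
; to the pass-1 results with DCT_SHIFT2 rounding.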
%macro DCT8_PASS_1 4
    vpbroadcastq    m0,                 [r6 + %1]
    pmaddwd         m2,                 m%3, m0
    pmaddwd         m0,                 m%4
    phaddd          m2,                 m0
    paddd           m2,                 m5
    psrad           m2,                 DCT_SHIFT
    packssdw        m2,                 m2
    vpermq          m2,                 m2, 0x08
    mova            [r5 + %2],          xm2
%endmacro

%macro DCT8_PASS_2 2
    vbroadcasti128  m4,                 [r6 + %1]
    pmaddwd         m6,                 m0, m4
    pmaddwd         m7,                 m1, m4
    pmaddwd         m8,                 m2, m4
    pmaddwd         m9,                 m3, m4
    phaddd          m6,                 m7
    phaddd          m8,                 m9
    phaddd          m6,                 m8
    paddd           m6,                 m5
    psrad           m6,                 DCT_SHIFT2

    vbroadcasti128  m4,                 [r6 + %2]
    pmaddwd         m10,                m0, m4
    pmaddwd         m7,                 m1, m4
    pmaddwd         m8,                 m2, m4
    pmaddwd         m9,                 m3, m4
    phaddd          m10,                m7
    phaddd          m8,                 m9
    phaddd          m10,                m8
    paddd           m10,                m5
    psrad           m10,                DCT_SHIFT2

    packssdw        m6,                 m10
    vpermq          m10,                m6, 0xD8

%endmacro

INIT_YMM avx2
cglobal dct8, 3, 7, 11, 0-8*16
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          6
    vbroadcasti128  m5,                [pd_16]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          4
    vbroadcasti128  m5,                [pd_8]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          2
    vbroadcasti128  m5,                [pd_2]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         9
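; HEVC forward-transform scaling for N = 8: pass 1 shifts by
; log2(8) + BIT_DEPTH - 9 (hence 2/4/6 above) and pass 2 by log2(8) + 6 = 9;
; each rounding constant is 1 << (shift - 1), e.g. pd_256 = 1 << 8 for the
; shift-9 pass.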

    add             r2d,               r2d
    lea             r3,                [r2 * 3]
    lea             r4,                [r0 + r2 * 4]
    mov             r5,                rsp
    lea             r6,                [tab_dct8]
    mova            m6,                [dct8_shuf]

    ;pass1
    mova            xm0,               [r0]
    vinserti128     m0,                m0, [r4], 1
    mova            xm1,               [r0 + r2]
    vinserti128     m1,                m1, [r4 + r2], 1
    mova            xm2,               [r0 + r2 * 2]
    vinserti128     m2,                m2, [r4 + r2 * 2], 1
    mova            xm3,               [r0 + r3]
    vinserti128     m3,                m3, [r4 + r3], 1

    punpcklqdq      m4,                m0, m1
    punpckhqdq      m0,                m1
    punpcklqdq      m1,                m2, m3
    punpckhqdq      m2,                m3

    pshufb          m0,                m6
    pshufb          m2,                m6

    paddw           m3,                m4, m0
    paddw           m7,                m1, m2

    psubw           m4,                m0
    psubw           m1,                m2

    DCT8_PASS_1     0 * 16,             0 * 16, 3, 7
    DCT8_PASS_1     1 * 16,             2 * 16, 4, 1
    DCT8_PASS_1     2 * 16,             4 * 16, 3, 7
    DCT8_PASS_1     3 * 16,             6 * 16, 4, 1
    DCT8_PASS_1     4 * 16,             1 * 16, 3, 7
    DCT8_PASS_1     5 * 16,             3 * 16, 4, 1
    DCT8_PASS_1     6 * 16,             5 * 16, 3, 7
    DCT8_PASS_1     7 * 16,             7 * 16, 4, 1

    ;pass2
    vbroadcasti128  m5,                [pd_256]

    mova            m0,                [r5]
    mova            m1,                [r5 + 32]
    mova            m2,                [r5 + 64]
    mova            m3,                [r5 + 96]

    DCT8_PASS_2     0 * 16, 1 * 16
    movu            [r1],              m10
    DCT8_PASS_2     2 * 16, 3 * 16
    movu            [r1 + 32],         m10
    DCT8_PASS_2     4 * 16, 5 * 16
    movu            [r1 + 64],         m10
    DCT8_PASS_2     6 * 16, 7 * 16
    movu            [r1 + 96],         m10
    RET

%macro DCT16_PASS_1_E 2
    vpbroadcastq    m7,                [r7 + %1]

    pmaddwd         m4,                m0, m7
    pmaddwd         m6,                m2, m7
    phaddd          m4,                m6

    paddd           m4,                m9
    psrad           m4,                DCT_SHIFT

    packssdw        m4,                m4
    vpermq          m4,                m4, 0x08

    mova            [r5 + %2],         xm4
%endmacro

%macro DCT16_PASS_1_O 2
    vbroadcasti128  m7,                [r7 + %1]

    pmaddwd         m10,               m0, m7
    pmaddwd         m11,               m2, m7
    phaddd          m10,               m11                 ; [d0 d0 d1 d1 d4 d4 d5 d5]

    pmaddwd         m11,               m4, m7
    pmaddwd         m12,               m6, m7
    phaddd          m11,               m12                 ; [d2 d2 d3 d3 d6 d6 d7 d7]

    phaddd          m10,               m11                 ; [d0 d1 d2 d3 d4 d5 d6 d7]

    paddd           m10,               m9
    psrad           m10,               DCT_SHIFT

    packssdw        m10,               m10                 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
    vpermq          m10,               m10, 0x08

    mova            [r5 + %2],         xm10
%endmacro

%macro DCT16_PASS_2 2
    vbroadcasti128  m8,                [r7 + %1]
    vbroadcasti128  m13,               [r8 + %1]

    pmaddwd         m10,               m0, m8
    pmaddwd         m11,               m1, m13
    paddd           m10,               m11

    pmaddwd         m11,               m2, m8
    pmaddwd         m12,               m3, m13
    paddd           m11,               m12
    phaddd          m10,               m11

    pmaddwd         m11,               m4, m8
    pmaddwd         m12,               m5, m13
    paddd           m11,               m12

    pmaddwd         m12,               m6, m8
    pmaddwd         m13,               m7, m13
    paddd           m12,               m13
    phaddd          m11,               m12

    phaddd          m10,               m11
    paddd           m10,               m9
    psrad           m10,               DCT_SHIFT2


    vbroadcasti128  m8,                [r7 + %2]
    vbroadcasti128  m13,               [r8 + %2]

    pmaddwd         m14,               m0, m8
    pmaddwd         m11,               m1, m13
    paddd           m14,               m11

    pmaddwd         m11,               m2, m8
    pmaddwd         m12,               m3, m13
    paddd           m11,               m12
    phaddd          m14,               m11

    pmaddwd         m11,               m4, m8
    pmaddwd         m12,               m5, m13
    paddd           m11,               m12

    pmaddwd         m12,               m6, m8
    pmaddwd         m13,               m7, m13
    paddd           m12,               m13
    phaddd          m11,               m12

    phaddd          m14,               m11
    paddd           m14,               m9
    psrad           m14,               DCT_SHIFT2

    packssdw        m10,               m14
    vextracti128    xm14,              m10,       1
    movlhps         xm15,              xm10,      xm14
    movhlps         xm14,              xm10
%endmacro
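; DCT16_PASS_2 leaves the two finished 8-coefficient half-rows of the output
; in xm15 and xm14, which the caller stores to consecutive rows.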
INIT_YMM avx2
cglobal dct16, 3, 9, 16, 0-16*mmsize
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          7
    vbroadcasti128  m9,                [pd_64]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          5
    vbroadcasti128  m9,                [pd_16]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          3
    vbroadcasti128  m9,                [pd_4]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         10
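; Same scaling rule as dct8, for N = 16: pass 1 shifts by
; log2(16) + BIT_DEPTH - 9 (3/5/7 above), pass 2 by log2(16) + 6 = 10, with
; rounds of 1 << (shift - 1) (pd_512 = 1 << 9 for pass 2).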

    add             r2d,               r2d

    mova            m13,               [dct16_shuf1]
    mova            m14,               [dct16_shuf2]
    lea             r7,                [tab_dct16_1 + 8 * 16]
    lea             r8,                [tab_dct16_2 + 8 * 16]
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               2                   ; each iteration processes 8 rows, so 16/8 = 2 iterations

.pass1:
    lea             r6,                [r0 + r2 * 4]

    movu            m2,                [r0]
    movu            m1,                [r6]
    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row4lo]
    vperm2i128      m1,                m2, m1, 0x31        ; [row0hi  row4hi]

    movu            m4,                [r0 + r2]
    movu            m3,                [r6 + r2]
    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row5lo]
    vperm2i128      m3,                m4, m3, 0x31        ; [row1hi  row5hi]

    movu            m6,                [r0 + r2 * 2]
    movu            m5,                [r6 + r2 * 2]
    vperm2i128      m4,                m6, m5, 0x20        ; [row2lo  row6lo]
    vperm2i128      m5,                m6, m5, 0x31        ; [row2hi  row6hi]

    movu            m8,                [r0 + r3]
    movu            m7,                [r6 + r3]
    vperm2i128      m6,                m8, m7, 0x20        ; [row3lo  row7lo]
    vperm2i128      m7,                m8, m7, 0x31        ; [row3hi  row7hi]

    pshufb          m1,                m13
    pshufb          m3,                m13
    pshufb          m5,                m13
    pshufb          m7,                m13

    paddw           m8,                m0, m1              ;E
    psubw           m0,                m1                  ;O

    paddw           m1,                m2, m3              ;E
    psubw           m2,                m3                  ;O

    paddw           m3,                m4, m5              ;E
    psubw           m4,                m5                  ;O

    paddw           m5,                m6, m7              ;E
    psubw           m6,                m7                  ;O

    DCT16_PASS_1_O  -7 * 16,           1 * 32
    DCT16_PASS_1_O  -5 * 16,           3 * 32
    DCT16_PASS_1_O  -3 * 16,           1 * 32 + 16
    DCT16_PASS_1_O  -1 * 16,           3 * 32 + 16
    DCT16_PASS_1_O  1 * 16,            5 * 32
    DCT16_PASS_1_O  3 * 16,            7 * 32
    DCT16_PASS_1_O  5 * 16,            5 * 32 + 16
    DCT16_PASS_1_O  7 * 16,            7 * 32 + 16

    pshufb          m8,                m14
    pshufb          m1,                m14
    phaddw          m0,                m8, m1

    pshufb          m3,                m14
    pshufb          m5,                m14
    phaddw          m2,                m3, m5

    DCT16_PASS_1_E  -8 * 16,           0 * 32
    DCT16_PASS_1_E  -4 * 16,           0 * 32 + 16
    DCT16_PASS_1_E  0 * 16,            4 * 32
    DCT16_PASS_1_E  4 * 16,            4 * 32 + 16

    phsubw          m0,                m8, m1
    phsubw          m2,                m3, m5

    DCT16_PASS_1_E  -6 * 16,           2 * 32
    DCT16_PASS_1_E  -2 * 16,           2 * 32 + 16
    DCT16_PASS_1_E  2 * 16,            6 * 32
    DCT16_PASS_1_E  6 * 16,            6 * 32 + 16

    lea             r0,                [r0 + 8 * r2]
    add             r5,                256

    dec             r4d
    jnz             .pass1

    mov             r5,                rsp
    mov             r4d,               2
    mov             r2d,               32
    lea             r3,                [r2 * 3]
    vbroadcasti128  m9,                [pd_512]

.pass2:
    mova            m0,                [r5 + 0 * 32]        ; [row0lo  row4lo]
    mova            m1,                [r5 + 8 * 32]        ; [row0hi  row4hi]

    mova            m2,                [r5 + 1 * 32]        ; [row1lo  row5lo]
    mova            m3,                [r5 + 9 * 32]        ; [row1hi  row5hi]

    mova            m4,                [r5 + 2 * 32]        ; [row2lo  row6lo]
    mova            m5,                [r5 + 10 * 32]       ; [row2hi  row6hi]

    mova            m6,                [r5 + 3 * 32]        ; [row3lo  row7lo]
    mova            m7,                [r5 + 11 * 32]       ; [row3hi  row7hi]

    DCT16_PASS_2    -8 * 16, -7 * 16
    movu            [r1],              xm15
    movu            [r1 + r2],         xm14

    DCT16_PASS_2    -6 * 16, -5 * 16
    movu            [r1 + r2 * 2],     xm15
    movu            [r1 + r3],         xm14

    lea             r6,                [r1 + r2 * 4]
    DCT16_PASS_2    -4 * 16, -3 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    -2 * 16, -1 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    lea             r6,                [r6 + r2 * 4]
    DCT16_PASS_2    0 * 16, 1 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    2 * 16, 3 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    lea             r6,                [r6 + r2 * 4]
    DCT16_PASS_2    4 * 16, 5 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    6 * 16, 7 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    add             r1,                16
    add             r5,                128

    dec             r4d
    jnz             .pass2
    RET

%macro DCT32_PASS_1 4
    vbroadcasti128  m8,                [r7 + %1]

    pmaddwd         m11,               m%3, m8
    pmaddwd         m12,               m%4, m8
    phaddd          m11,               m12

    vbroadcasti128  m8,                [r7 + %1 + 32]
    vbroadcasti128  m10,               [r7 + %1 + 48]
    pmaddwd         m12,               m5, m8
    pmaddwd         m13,               m6, m10
    phaddd          m12,               m13

    pmaddwd         m13,               m4, m8
    pmaddwd         m14,               m7, m10
    phaddd          m13,               m14

    phaddd          m12,               m13

    phaddd          m11,               m12
    paddd           m11,               m9
    psrad           m11,               DCT_SHIFT

    vpermq          m11,               m11, 0xD8
    packssdw        m11,               m11
    movq            [r5 + %2],         xm11
    vextracti128    xm10,              m11, 1
    movq            [r5 + %2 + 64],    xm10
%endmacro

%macro DCT32_PASS_2 1
    mova            m8,                [r7 + %1]
    mova            m10,               [r8 + %1]
    pmaddwd         m11,               m0, m8
    pmaddwd         m12,               m1, m10
    paddd           m11,               m12

    pmaddwd         m12,               m2, m8
    pmaddwd         m13,               m3, m10
    paddd           m12,               m13

    phaddd          m11,               m12

    pmaddwd         m12,               m4, m8
    pmaddwd         m13,               m5, m10
    paddd           m12,               m13

    pmaddwd         m13,               m6, m8
    pmaddwd         m14,               m7, m10
    paddd           m13,               m14

    phaddd          m12,               m13

    phaddd          m11,               m12
    vextracti128    xm10,              m11, 1
    paddd           xm11,              xm10

    paddd           xm11,               xm9
    psrad           xm11,               DCT_SHIFT2
    packssdw        xm11,               xm11

%endmacro

INIT_YMM avx2
cglobal dct32, 3, 9, 16, 0-64*mmsize
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          8
    vpbroadcastq    m9,                [pd_128]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          6
    vpbroadcastq    m9,                [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          4
    vpbroadcastq    m9,                [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         11
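; Same scaling rule for N = 32: pass 1 shifts by log2(32) + BIT_DEPTH - 9
; (4/6/8 above), pass 2 by log2(32) + 6 = 11, and the pass-2 round is
; pd_1024 = 1 << 10.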

    add             r2d,               r2d

    lea             r7,                [tab_dct32_1]
    lea             r8,                [tab_dct32_2]
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               8
    mova            m15,               [dct16_shuf1]

.pass1:
    movu            m2,                [r0]
    movu            m1,                [r0 + 32]
    pshufb          m1,                m15
    vpermq          m1,                m1, 0x4E
    psubw           m7,                m2, m1
    paddw           m2,                m1

    movu            m1,                [r0 + r2 * 2]
    movu            m0,                [r0 + r2 * 2 + 32]
    pshufb          m0,                m15
    vpermq          m0,                m0, 0x4E
    psubw           m8,                m1, m0
    paddw           m1,                m0
    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row2lo] for E
    vperm2i128      m3,                m2, m1, 0x31        ; [row0hi  row2hi] for E
    pshufb          m3,                m15
    psubw           m1,                m0, m3
    paddw           m0,                m3

    vperm2i128      m5,                m7, m8, 0x20        ; [row0lo  row2lo] for O
    vperm2i128      m6,                m7, m8, 0x31        ; [row0hi  row2hi] for O


    movu            m4,                [r0 + r2]
    movu            m2,                [r0 + r2 + 32]
    pshufb          m2,                m15
    vpermq          m2,                m2, 0x4E
    psubw           m10,               m4, m2
    paddw           m4,                m2

    movu            m3,                [r0 + r3]
    movu            m2,                [r0 + r3 + 32]
    pshufb          m2,                m15
    vpermq          m2,                m2, 0x4E
    psubw           m11,               m3, m2
    paddw           m3,                m2
    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row3lo] for E
    vperm2i128      m8,                m4, m3, 0x31        ; [row1hi  row3hi] for E
    pshufb          m8,                m15
    psubw           m3,                m2, m8
    paddw           m2,                m8

    vperm2i128      m4,                m10, m11, 0x20      ; [row1lo  row3lo] for O
    vperm2i128      m7,                m10, m11, 0x31      ; [row1hi  row3hi] for O


    DCT32_PASS_1    0 * 32,            0 * 64, 0, 2
    DCT32_PASS_1    2 * 32,            2 * 64, 1, 3
    DCT32_PASS_1    4 * 32,            4 * 64, 0, 2
    DCT32_PASS_1    6 * 32,            6 * 64, 1, 3
    DCT32_PASS_1    8 * 32,            8 * 64, 0, 2
    DCT32_PASS_1    10 * 32,           10 * 64, 1, 3
    DCT32_PASS_1    12 * 32,           12 * 64, 0, 2
    DCT32_PASS_1    14 * 32,           14 * 64, 1, 3
    DCT32_PASS_1    16 * 32,           16 * 64, 0, 2
    DCT32_PASS_1    18 * 32,           18 * 64, 1, 3
    DCT32_PASS_1    20 * 32,           20 * 64, 0, 2
    DCT32_PASS_1    22 * 32,           22 * 64, 1, 3
    DCT32_PASS_1    24 * 32,           24 * 64, 0, 2
    DCT32_PASS_1    26 * 32,           26 * 64, 1, 3
    DCT32_PASS_1    28 * 32,           28 * 64, 0, 2
    DCT32_PASS_1    30 * 32,           30 * 64, 1, 3

    add             r5,                8
    lea             r0,                [r0 + r2 * 4]

    dec             r4d
    jnz             .pass1

    mov             r2d,               64
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               8
    vpbroadcastq    m9,                [pd_1024]

.pass2:
    mova            m0,                [r5 + 0 * 64]
    mova            m1,                [r5 + 0 * 64 + 32]

    mova            m2,                [r5 + 1 * 64]
    mova            m3,                [r5 + 1 * 64 + 32]

    mova            m4,                [r5 + 2 * 64]
    mova            m5,                [r5 + 2 * 64 + 32]

    mova            m6,                [r5 + 3 * 64]
    mova            m7,                [r5 + 3 * 64 + 32]

    DCT32_PASS_2    0 * 32
    movq            [r1],              xm11
    DCT32_PASS_2    1 * 32
    movq            [r1 + r2],         xm11
    DCT32_PASS_2    2 * 32
    movq            [r1 + r2 * 2],     xm11
    DCT32_PASS_2    3 * 32
    movq            [r1 + r3],         xm11

    lea             r6,                [r1 + r2 * 4]
    DCT32_PASS_2    4 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    5 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    6 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    7 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    8 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    9 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    10 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    11 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    12 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    13 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    14 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    15 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    16 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    17 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    18 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    19 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    20 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    21 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    22 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    23 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    24 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    25 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    26 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    27 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    28 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    29 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    30 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    31 * 32
    movq            [r6 + r3],         xm11

    add             r5,                256
    add             r1,                8

    dec             r4d
    jnz             .pass2
    RET

%macro IDCT8_PASS_1 1
    vpbroadcastd    m7,                [r5 + %1]
    vpbroadcastd    m10,               [r5 + %1 + 4]
    pmaddwd         m5,                m4, m7
    pmaddwd         m6,                m0, m10
    paddd           m5,                m6

    vpbroadcastd    m7,                [r6 + %1]
    vpbroadcastd    m10,               [r6 + %1 + 4]
    pmaddwd         m6,                m1, m7
    pmaddwd         m3,                m2, m10
    paddd           m6,                m3

    paddd           m3,                m5, m6
    paddd           m3,                m11
    psrad           m3,                IDCT_SHIFT1

    psubd           m5,                m6
    paddd           m5,                m11
    psrad           m5,                IDCT_SHIFT1

    vpbroadcastd    m7,                [r5 + %1 + 32]
    vpbroadcastd    m10,               [r5 + %1 + 36]
    pmaddwd         m6,                m4, m7
    pmaddwd         m8,                m0, m10
    paddd           m6,                m8

    vpbroadcastd    m7,                [r6 + %1 + 32]
    vpbroadcastd    m10,               [r6 + %1 + 36]
    pmaddwd         m8,                m1, m7
    pmaddwd         m9,                m2, m10
    paddd           m8,                m9

    paddd           m9,                m6, m8
    paddd           m9,                m11
    psrad           m9,                IDCT_SHIFT1

    psubd           m6,                m8
    paddd           m6,                m11
    psrad           m6,                IDCT_SHIFT1

    packssdw        m3,                m9
    vpermq          m3,                m3, 0xD8

    packssdw        m6,                m5
    vpermq          m6,                m6, 0xD8
%endmacro

%macro IDCT8_PASS_2 0
    punpcklqdq      m2,                m0, m1
    punpckhqdq      m0,                m1

    pmaddwd         m3,                m2, [r5]
    pmaddwd         m5,                m2, [r5 + 32]
    pmaddwd         m6,                m2, [r5 + 64]
    pmaddwd         m7,                m2, [r5 + 96]
    phaddd          m3,                m5
    phaddd          m6,                m7
    pshufb          m3,                [idct8_shuf2]
    pshufb          m6,                [idct8_shuf2]
    punpcklqdq      m7,                m3, m6
    punpckhqdq      m3,                m6

    pmaddwd         m5,                m0, [r6]
    pmaddwd         m6,                m0, [r6 + 32]
    pmaddwd         m8,                m0, [r6 + 64]
    pmaddwd         m9,                m0, [r6 + 96]
    phaddd          m5,                m6
    phaddd          m8,                m9
    pshufb          m5,                [idct8_shuf2]
    pshufb          m8,                [idct8_shuf2]
    punpcklqdq      m6,                m5, m8
    punpckhqdq      m5,                m8

    paddd           m8,                m7, m6
    paddd           m8,                m12
    psrad           m8,                IDCT_SHIFT2

    psubd           m7,                m6
    paddd           m7,                m12
    psrad           m7,                IDCT_SHIFT2

    pshufb          m7,                [idct8_shuf3]
    packssdw        m8,                 m7

    paddd           m9,                m3, m5
    paddd           m9,                m12
    psrad           m9,                IDCT_SHIFT2

    psubd           m3,                m5
    paddd           m3,                m12
    psrad           m3,                IDCT_SHIFT2

    pshufb          m3,                [idct8_shuf3]
    packssdw        m9,                m3
%endmacro

INIT_YMM avx2
cglobal idct8, 3, 7, 13, 0-8*16
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m12,                [pd_256]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m12,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m12,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1         7
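; HEVC inverse-transform scaling: pass 1 always shifts by 7 (round
; pd_64 = 1 << 6), pass 2 by 20 - BIT_DEPTH, so the rounds above are
; 1 << (IDCT_SHIFT2 - 1).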

    vbroadcasti128  m11,               [pd_64]

    mov             r4,                rsp
    lea             r5,                [avx2_idct8_1]
    lea             r6,                [avx2_idct8_2]

    ;pass1
    mova            m1,                [r0 + 0 * 32]     ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
    mova            m0,                [r0 + 1 * 32]     ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
    vpunpcklwd      m5,      m1,       m0                ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
    vpunpckhwd      m1,      m0                          ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
    vinserti128     m4,      m5,       xm1,       1      ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
    vextracti128    xm2,     m5,       1                 ; [1 3 1 3 1 3 1 3]
    vinserti128     m1,      m1,       xm2,       0      ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]

    mova            m2,                [r0 + 2 * 32]     ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
    mova            m0,                [r0 + 3 * 32]     ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
    vpunpcklwd      m5,      m2,       m0                ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
    vpunpckhwd      m2,      m0                          ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
    vinserti128     m0,      m5,       xm2,       1     ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
    vextracti128    xm5,     m5,       1                ; [5 7 5 7 5 7 5 7]
    vinserti128     m2,      m2,       xm5,       0     ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]

    mova            m5,                [idct8_shuf1]
    vpermd          m4,                m5, m4
    vpermd          m0,                m5, m0
    vpermd          m1,                m5, m1
    vpermd          m2,                m5, m2

    IDCT8_PASS_1    0
    mova            [r4],              m3
    mova            [r4 + 96],         m6

    IDCT8_PASS_1    64
    mova            [r4 + 32],         m3
    mova            [r4 + 64],         m6

    ;pass2
    add             r2d,               r2d
    lea             r3,                [r2 * 3]

    mova            m0,                [r4]
    mova            m1,                [r4 + 32]
    IDCT8_PASS_2

    vextracti128    xm3,               m8, 1
    mova            [r1],              xm8
    mova            [r1 + r2],         xm3
    vextracti128    xm3,               m9, 1
    mova            [r1 + r2 * 2],     xm9
    mova            [r1 + r3],         xm3

    lea             r1,                [r1 + r2 * 4]
    mova            m0,                [r4 + 64]
    mova            m1,                [r4 + 96]
    IDCT8_PASS_2

    vextracti128    xm3,               m8, 1
    mova            [r1],              xm8
    mova            [r1 + r2],         xm3
    vextracti128    xm3,               m9, 1
    mova            [r1 + r2 * 2],     xm9
    mova            [r1 + r3],         xm3
    RET

%macro IDCT_PASS1 2
    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]

    pmaddwd         m9, m0, m5
    pmaddwd         m10, m7, m5
    phaddd          m9, m10

    pmaddwd         m10, m6, m5
    pmaddwd         m11, m8, m5
    phaddd          m10, m11

    phaddd          m9, m10
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]

    pmaddwd         m10, m1, m5
    pmaddwd         m11, m3, m5
    phaddd          m10, m11

    pmaddwd         m11, m4, m5
    pmaddwd         m12, m2, m5
    phaddd          m11, m12

    phaddd          m10, m11

    paddd           m11, m9, m10
    paddd           m11, m14
    psrad           m11, IDCT_SHIFT1

    psubd           m9, m10
    paddd           m9, m14
    psrad           m9, IDCT_SHIFT1

    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]

    pmaddwd         m10, m0, m5
    pmaddwd         m12, m7, m5
    phaddd          m10, m12

    pmaddwd         m12, m6, m5
    pmaddwd         m13, m8, m5
    phaddd          m12, m13

    phaddd          m10, m12
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16  + 16]

    pmaddwd         m12, m1, m5
    pmaddwd         m13, m3, m5
    phaddd          m12, m13

    pmaddwd         m13, m4, m5
    pmaddwd         m5, m2
    phaddd          m13, m5

    phaddd          m12, m13

    paddd           m5, m10, m12
    paddd           m5, m14
    psrad           m5, IDCT_SHIFT1

    psubd           m10, m12
    paddd           m10, m14
    psrad           m10, IDCT_SHIFT1

    packssdw        m11, m5
    packssdw        m9, m10

    mova            m10, [idct16_shuff]
    mova            m5,  [idct16_shuff1]

    vpermd          m12, m10, m11
    vpermd          m13, m5, m9
    mova            [r3 + %1 * 16 * 2], xm12
    mova            [r3 + %2 * 16 * 2], xm13
    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
%endmacro

;-------------------------------------------------------
; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
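; Pass 1 runs twice, once per 8-column half: it transposes a 16x8 input tile
; with the punpck sequences below, then IDCT_PASS1 writes the intermediate
; columns to the stack; pass 2 combines each scratch row with
; tab_idct16_1/tab_idct16_2 and stores two finished output rows per iteration.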
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_256]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1         7
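; Shifts as in the other inverse transforms: 7 for pass 1 (round pd_64),
; 20 - BIT_DEPTH for pass 2 with a round of 1 << (IDCT_SHIFT2 - 1).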

    vbroadcasti128  m14,               [pd_64]

    add             r2d,               r2d
    mov             r3, rsp
    mov             r4d, 2

.pass1:
    movu            xm0, [r0 +  0 * 32]
    movu            xm1, [r0 +  8 * 32]
    punpckhqdq      xm2, xm0, xm1
    punpcklqdq      xm0, xm1
    vinserti128     m0, m0, xm2, 1

    movu            xm1, [r0 +  1 * 32]
    movu            xm2, [r0 +  9 * 32]
    punpckhqdq      xm3, xm1, xm2
    punpcklqdq      xm1, xm2
    vinserti128     m1, m1, xm3, 1

    movu            xm2, [r0 +  2 * 32]
    movu            xm3, [r0 + 10 * 32]
    punpckhqdq      xm4, xm2, xm3
    punpcklqdq      xm2, xm3
    vinserti128     m2, m2, xm4, 1

    movu            xm3, [r0 +  3 * 32]
    movu            xm4, [r0 + 11 * 32]
    punpckhqdq      xm5, xm3, xm4
    punpcklqdq      xm3, xm4
    vinserti128     m3, m3, xm5, 1

    movu            xm4, [r0 +  4 * 32]
    movu            xm5, [r0 + 12 * 32]
    punpckhqdq      xm6, xm4, xm5
    punpcklqdq      xm4, xm5
    vinserti128     m4, m4, xm6, 1

    movu            xm5, [r0 +  5 * 32]
    movu            xm6, [r0 + 13 * 32]
    punpckhqdq      xm7, xm5, xm6
    punpcklqdq      xm5, xm6
    vinserti128     m5, m5, xm7, 1

    movu            xm6, [r0 +  6 * 32]
    movu            xm7, [r0 + 14 * 32]
    punpckhqdq      xm8, xm6, xm7
    punpcklqdq      xm6, xm7
    vinserti128     m6, m6, xm8, 1

    movu            xm7, [r0 +  7 * 32]
    movu            xm8, [r0 + 15 * 32]
    punpckhqdq      xm9, xm7, xm8
    punpcklqdq      xm7, xm8
    vinserti128     m7, m7, xm9, 1

    punpckhwd       m8, m0, m2                ;[8 10]
    punpcklwd       m0, m2                    ;[0 2]

    punpckhwd       m2, m1, m3                ;[9 11]
    punpcklwd       m1, m3                    ;[1 3]

    punpckhwd       m3, m4, m6                ;[12 14]
    punpcklwd       m4, m6                    ;[4 6]

    punpckhwd       m6, m5, m7                ;[13 15]
    punpcklwd       m5, m7                    ;[5 7]

    punpckhdq       m7, m0, m4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
    punpckldq       m0, m4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]

    punpckhdq       m4, m8, m3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
    punpckldq       m8, m3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]

    punpckhdq       m3, m1, m5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
    punpckldq       m1, m5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]

    punpckhdq       m5, m2, m6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
    punpckldq       m2, m6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]

    punpckhqdq      m6, m0, m8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
    punpcklqdq      m0, m8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]

    punpckhqdq      m8, m7, m4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
    punpcklqdq      m7, m4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]

    punpckhqdq      m4, m1, m2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
    punpcklqdq      m1, m2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]

    punpckhqdq      m2, m3, m5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
    punpcklqdq      m3, m5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]

    IDCT_PASS1      0, 14
    IDCT_PASS1      2, 12
    IDCT_PASS1      4, 10
    IDCT_PASS1      6, 8

    add             r0, 16
    add             r3, 16
    dec             r4d
    jnz             .pass1

    mov             r3, rsp
    mov             r4d, 8
    lea             r5, [tab_idct16_2]
    lea             r6, [tab_idct16_1]

    vbroadcasti128  m7,  [r5]
    vbroadcasti128  m8,  [r5 + 16]
    vbroadcasti128  m9,  [r5 + 32]
    vbroadcasti128  m10, [r5 + 48]
    vbroadcasti128  m11, [r5 + 64]
    vbroadcasti128  m12, [r5 + 80]
    vbroadcasti128  m13, [r5 + 96]

.pass2:
    movu            m1, [r3]
    vpermq          m0, m1, 0xD8

    pmaddwd         m1, m0, m7
    pmaddwd         m2, m0, m8
    phaddd          m1, m2

    pmaddwd         m2, m0, m9
    pmaddwd         m3, m0, m10
    phaddd          m2, m3

    phaddd          m1, m2

    pmaddwd         m2, m0, m11
    pmaddwd         m3, m0, m12
    phaddd          m2, m3

    vbroadcasti128  m14, [r5 + 112]
    pmaddwd         m3, m0, m13
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    phaddd          m2, m3

    movu            m3, [r3 + 32]
    vpermq          m0, m3, 0xD8

    vbroadcasti128  m14, [r6]
    pmaddwd         m3, m0, m14
    vbroadcasti128  m14, [r6 + 16]
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 32]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 48]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 64]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 80]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    vbroadcasti128  m14, [r6 + 96]
    pmaddwd         m6, m0, m14
    vbroadcasti128  m14, [r6 + 112]
    pmaddwd         m0, m14
    phaddd          m6, m0

    phaddd          m4, m6

    paddd           m5, m1, m3
    paddd           m5, m15
    psrad           m5, IDCT_SHIFT2

    psubd           m1, m3
    paddd           m1, m15
    psrad           m1, IDCT_SHIFT2

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    packssdw        m5, m6
    packssdw        m1, m2
    pshufb          m2, m1, [dct16_shuf1]

    mova            [r1], xm5
    mova            [r1 + 16], xm2
    vextracti128    [r1 + r2], m5, 1
    vextracti128    [r1 + r2 + 16], m2, 1

    lea             r1, [r1 + 2 * r2]
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET

%macro IDCT32_PASS1 1
    vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
    vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
    pmaddwd         m9, m4, m3
    pmaddwd         m10, m8, m13
    phaddd          m9, m10

    pmaddwd         m10, m2, m3
    pmaddwd         m11, m1, m13
    phaddd          m10, m11

    phaddd          m9, m10

    vbroadcasti128  m3, [tab_idct32_1 + (15 - %1) * 32]
    vbroadcasti128  m13, [tab_idct32_1 + (15 - %1) * 32 + 16]
    pmaddwd         m10, m4, m3
    pmaddwd         m11, m8, m13
    phaddd          m10, m11

    pmaddwd         m11, m2, m3
    pmaddwd         m12, m1, m13
    phaddd          m11, m12

    phaddd          m10, m11
    phaddd          m9, m10                       ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]

    vbroadcasti128  m3, [tab_idct32_2 + %1 * 16]
    pmaddwd         m10, m0, m3
    pmaddwd         m11, m7, m3
    phaddd          m10, m11
    phaddd          m10, m10

    vbroadcasti128  m3, [tab_idct32_3 + %1 * 16]
    pmaddwd         m11, m5, m3
    pmaddwd         m12, m6, m3
    phaddd          m11, m12
    phaddd          m11, m11

    paddd           m12, m10, m11                 ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
    psubd           m10, m11                      ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]

    punpcklqdq      m12, m10                      ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
    paddd           m10, m9, m12
    paddd           m10, m15
    psrad           m10, IDCT_SHIFT1

    psubd           m12, m9
    paddd           m12, m15
    psrad           m12, IDCT_SHIFT1

    packssdw        m10, m12
    vextracti128    xm12, m10, 1
    movd            [r3 + %1 * 64], xm10
    movd            [r3 + 32 + %1 * 64], xm12
    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4 + 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 * 64], xm10, 3
    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
%endmacro

;-------------------------------------------------------
; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
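; Pass 1 makes 8 passes over 4-column strips of the input (add r0, 8),
; scattering the packed results across the 32 scratch rows of 64 bytes each
; via the movd/pextrd stores in IDCT32_PASS1; pass 2 then consumes one
; 64-byte scratch row per iteration against tab_idct32_4 and writes the
; final output rows.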

; TODO: reduce the number of PHADDD instructions by replacing them with PADDD where possible

INIT_YMM avx2
cglobal idct32, 3, 6, 16, 0-32*64

%define             IDCT_SHIFT1         7

    vbroadcasti128  m15, [pd_64]

    mov             r3, rsp
    lea             r4, [r3 + 15 * 64]
    mov             r5d, 8

.pass1:
    movq            xm0,    [r0 +  2 * 64]
    movq            xm1,    [r0 + 18 * 64]
    punpcklqdq      xm0, xm0, xm1
    movq            xm1,    [r0 +  0 * 64]
    movq            xm2,    [r0 + 16 * 64]
    punpcklqdq      xm1, xm1, xm2
    vinserti128     m0,  m0,  xm1, 1             ;[2 18 0 16]

    movq            xm1,    [r0 + 1 * 64]
    movq            xm2,    [r0 + 9 * 64]
    punpcklqdq      xm1, xm1, xm2
    movq            xm2,    [r0 + 17 * 64]
    movq            xm3,    [r0 + 25 * 64]
    punpcklqdq      xm2, xm2, xm3
    vinserti128     m1,  m1,  xm2, 1             ;[1 9 17 25]

    movq            xm2,    [r0 + 6 * 64]
    movq            xm3,    [r0 + 22 * 64]
    punpcklqdq      xm2, xm2, xm3
    movq            xm3,    [r0 + 4 * 64]
    movq            xm4,    [r0 + 20 * 64]
    punpcklqdq      xm3, xm3, xm4
    vinserti128     m2,  m2,  xm3, 1             ;[6 22 4 20]

    movq            xm3,    [r0 + 3 * 64]
    movq            xm4,    [r0 + 11 * 64]
    punpcklqdq      xm3, xm3, xm4
    movq            xm4,    [r0 + 19 * 64]
    movq            xm5,    [r0 + 27 * 64]
    punpcklqdq      xm4, xm4, xm5
    vinserti128     m3,  m3,  xm4, 1             ;[3 11 19 27]

    movq            xm4,    [r0 + 10 * 64]
    movq            xm5,    [r0 + 26 * 64]
    punpcklqdq      xm4, xm4, xm5
    movq            xm5,    [r0 + 8 * 64]
    movq            xm6,    [r0 + 24 * 64]
    punpcklqdq      xm5, xm5, xm6
    vinserti128     m4,  m4,  xm5, 1             ;[10 26 8 24]

    movq            xm5,    [r0 + 5 * 64]
    movq            xm6,    [r0 + 13 * 64]
    punpcklqdq      xm5, xm5, xm6
    movq            xm6,    [r0 + 21 * 64]
    movq            xm7,    [r0 + 29 * 64]
    punpcklqdq      xm6, xm6, xm7
    vinserti128     m5,  m5,  xm6, 1             ;[5 13 21 29]

    movq            xm6,    [r0 + 14 * 64]
    movq            xm7,    [r0 + 30 * 64]
    punpcklqdq      xm6, xm6, xm7
    movq            xm7,    [r0 + 12 * 64]
    movq            xm8,    [r0 + 28 * 64]
    punpcklqdq      xm7, xm7, xm8
    vinserti128     m6,  m6,  xm7, 1             ;[14 30 12 28]

    movq            xm7,    [r0 + 7 * 64]
    movq            xm8,    [r0 + 15 * 64]
    punpcklqdq      xm7, xm7, xm8
    movq            xm8,    [r0 + 23 * 64]
    movq            xm9,    [r0 + 31 * 64]
    punpcklqdq      xm8, xm8, xm9
    vinserti128     m7,  m7,  xm8, 1             ;[7 15 23 31]

    punpckhwd       m8, m0, m2                  ;[18 22 16 20]
    punpcklwd       m0, m2                      ;[2 6 0 4]

    punpckhwd       m2, m1, m3                  ;[9 11 25 27]
    punpcklwd       m1, m3                      ;[1 3 17 19]

    punpckhwd       m3, m4, m6                  ;[26 30 24 28]
    punpcklwd       m4, m6                      ;[10 14 8 12]

    punpckhwd       m6, m5, m7                  ;[13 15 29 31]
    punpcklwd       m5, m7                      ;[5 7 21 23]

    punpckhdq       m7, m0, m4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
    punpckldq       m0, m4                      ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]

    punpckhdq       m4, m8, m3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
    punpckldq       m8, m3                      ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]

    punpckhdq       m3, m1, m5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
    punpckldq       m1, m5                      ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]

    punpckhdq       m5, m2, m6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
    punpckldq       m2, m6                      ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]

    punpckhqdq      m6, m0, m8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
    punpcklqdq      m0, m8                      ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]

    punpckhqdq      m8, m7, m4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
    punpcklqdq      m7, m4                      ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]

    punpckhqdq      m4, m1, m2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
    punpcklqdq      m1, m2                      ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]

    punpckhqdq      m2, m3, m5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
    punpcklqdq      m3, m5                      ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]

    vperm2i128      m5, m0, m6, 0x20            ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
    vperm2i128      m0, m0, m6, 0x31            ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]

    vperm2i128      m6, m7, m8, 0x20            ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
    vperm2i128      m7, m7, m8, 0x31            ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]

    vperm2i128      m8, m1, m4, 0x31            ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
    vperm2i128      m4, m1, m4, 0x20            ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]

    vperm2i128      m1, m3, m2, 0x31            ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
    vperm2i128      m2, m3, m2, 0x20            ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]

    IDCT32_PASS1 0
    IDCT32_PASS1 1
    IDCT32_PASS1 2
    IDCT32_PASS1 3
    IDCT32_PASS1 4
    IDCT32_PASS1 5
    IDCT32_PASS1 6
    IDCT32_PASS1 7

    add             r0, 8
    add             r3, 4
    add             r4, 4
    dec             r5d
    jnz             .pass1

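; The pass-2 rounding constant is 1 << (IDCT_SHIFT2 - 1), so the paddd/psrad
; pairs below round to nearest: 128 for 12-bit, 512 for 10-bit and 2048 for
; 8-bit output.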
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif

    mov             r3, rsp
    add             r2d, r2d
    mov             r4d, 32

    mova            m7,  [tab_idct32_4]
    mova            m8,  [tab_idct32_4 + 32]
    mova            m9,  [tab_idct32_4 + 64]
    mova            m10, [tab_idct32_4 + 96]
    mova            m11, [tab_idct32_4 + 128]
    mova            m12, [tab_idct32_4 + 160]
    mova            m13, [tab_idct32_4 + 192]
    mova            m14, [tab_idct32_4 + 224]
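    ; Pass 2: each iteration reads one 64-byte transposed row from the stack
    ; (r3), multiply-accumulates it against the tab_idct32_4/tab_idct32_1
    ; coefficient tables, then rounds, shifts, packs and stores 32 output
    ; samples at r1.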
.pass2:
    movu            m0, [r3]
    movu            m1, [r3 + 32]

    pmaddwd         m2, m0, m7
    pmaddwd         m3, m0, m8
    phaddd          m2, m3

    pmaddwd         m3, m0, m9
    pmaddwd         m4, m0, m10
    phaddd          m3, m4

    phaddd          m2, m3

    pmaddwd         m3, m0, m11
    pmaddwd         m4, m0, m12
    phaddd          m3, m4

    pmaddwd         m4, m0, m13
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    phaddd          m3, m4

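    ; phaddd adds within each 128-bit lane only, so this row's partial sums
    ; straddle both lanes; the vperm2i128 pair regroups them and paddd
    ; finishes the horizontal reduction.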
    vperm2i128      m4, m2, m3, 0x31
    vperm2i128      m2, m2, m3, 0x20
    paddd           m2, m4

    pmaddwd         m3, m0, [tab_idct32_4 + 256]
    pmaddwd         m4, m0, [tab_idct32_4 + 288]
    phaddd          m3, m4

    pmaddwd         m4, m0, [tab_idct32_4 + 320]
    pmaddwd         m5, m0, [tab_idct32_4 + 352]
    phaddd          m4, m5

    phaddd          m3, m4

    pmaddwd         m4, m0, [tab_idct32_4 + 384]
    pmaddwd         m5, m0, [tab_idct32_4 + 416]
    phaddd          m4, m5

    pmaddwd         m5, m0, [tab_idct32_4 + 448]
    pmaddwd         m0,     [tab_idct32_4 + 480]
    phaddd          m5, m0

    phaddd          m4, m5

    vperm2i128      m0, m3, m4, 0x31
    vperm2i128      m3, m3, m4, 0x20
    paddd           m3, m0

    pmaddwd         m4, m1, [tab_idct32_1]
    pmaddwd         m0, m1, [tab_idct32_1 + 32]
    phaddd          m4, m0

    pmaddwd         m5, m1, [tab_idct32_1 + 64]
    pmaddwd         m0, m1, [tab_idct32_1 + 96]
    phaddd          m5, m0

    phaddd          m4, m5

    pmaddwd         m5, m1, [tab_idct32_1 + 128]
    pmaddwd         m0, m1, [tab_idct32_1 + 160]
    phaddd          m5, m0

    pmaddwd         m6, m1, [tab_idct32_1 + 192]
    pmaddwd         m0, m1, [tab_idct32_1 + 224]
    phaddd          m6, m0

    phaddd          m5, m6

    vperm2i128      m0, m4, m5, 0x31
    vperm2i128      m4, m4, m5, 0x20
    paddd           m4, m0

    pmaddwd         m5, m1, [tab_idct32_1 + 256]
    pmaddwd         m0, m1, [tab_idct32_1 + 288]
    phaddd          m5, m0

    pmaddwd         m6, m1, [tab_idct32_1 + 320]
    pmaddwd         m0, m1, [tab_idct32_1 + 352]
    phaddd          m6, m0

    phaddd          m5, m6

    pmaddwd         m6, m1, [tab_idct32_1 + 384]
    pmaddwd         m0, m1, [tab_idct32_1 + 416]
    phaddd          m6, m0

    pmaddwd         m0, m1, [tab_idct32_1 + 448]
    pmaddwd         m1,     [tab_idct32_1 + 480]
    phaddd          m0, m1

    phaddd          m6, m0

    vperm2i128      m0, m5, m6, 0x31
    vperm2i128      m5, m5, m6, 0x20
    paddd           m5, m0

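    ; Butterfly: the sums give the first 16 outputs of the row, the
    ; differences the mirrored last 16 (in reverse order); the vpermq/pshufb
    ; below restores ascending order before the final pack and store.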
    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    paddd           m4, m3, m5
    paddd           m4, m15
    psrad           m4, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m15
    psrad           m3, IDCT_SHIFT2

    packssdw        m6, m4
    packssdw        m2, m3

    vpermq          m6, m6, 0xD8
    vpermq          m2, m2, 0x8D
    pshufb          m2, [dct16_shuf1]

    mova            [r1], m6
    mova            [r1 + 32], m2

    add             r1, r2
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET

;-------------------------------------------------------
; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct4, 3, 4, 6

%define             IDCT_SHIFT1         7
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m5,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m5,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m5,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
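    ; Rounding constants follow 1 << (shift - 1): the table above for
    ; IDCT_SHIFT2, and pd_64 below for IDCT_SHIFT1 (7).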
    vbroadcasti128  m4, [pd_64]

    add             r2d, r2d
    lea             r3, [r2 * 3]

    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]

    pshufb          m0, [idct4_shuf1]             ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
    vextracti128    xm1, m0, 1                    ;[20 22 21 23 30 32 31 33]
    punpcklwd       xm2, xm0, xm1                 ;[00 20 02 22 01 21 03 23]
    punpckhwd       xm0, xm1                      ;[10 30 12 32 11 31 13 33]
    vinserti128     m2, m2, xm2, 1                ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
    vinserti128     m0, m0, xm0, 1                ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
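    ; Rows 0/2 (interleaved in m2) and rows 1/3 (in m0) are duplicated into
    ; both lanes, so a single pmaddwd per coefficient table computes all
    ; four columns of the first pass.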

    mova            m1, [avx2_idct4_1]
    mova            m3, [avx2_idct4_1 + 32]
    pmaddwd         m1, m2
    pmaddwd         m3, m0

    paddd           m0, m1, m3
    paddd           m0, m4
    psrad           m0, IDCT_SHIFT1               ;[00 20 10 30 01 21 11 31]

    psubd           m1, m3
    paddd           m1, m4
    psrad           m1, IDCT_SHIFT1               ;[03 23 13 33 02 22 12 32]

    packssdw        m0, m1                        ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
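    ; vmovshdup/vmovsldup replicate the odd/even dwords so each word pair
    ; lines up with the vpbroadcastq'd avx2_idct4_2 coefficients in pass 2.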
    vmovshdup       m1, m0                        ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
    vmovsldup       m0, m0                        ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]

    vpbroadcastq    m2, [avx2_idct4_2]
    vpbroadcastq    m3, [avx2_idct4_2 + 8]
    pmaddwd         m0, m2
    pmaddwd         m1, m3

    paddd           m2, m0, m1
    paddd           m2, m5
    psrad           m2, IDCT_SHIFT2               ;[00 01 10 11 30 31 20 21]

    psubd           m0, m1
    paddd           m0, m5
    psrad           m0, IDCT_SHIFT2               ;[03 02 13 12 33 32 23 22]

    pshufb          m0, [idct4_shuf2]             ;[02 03 12 13 32 33 22 23]
    punpcklqdq      m1, m2, m0                    ;[00 01 02 03 10 11 12 13]
    punpckhqdq      m2, m0                        ;[30 31 32 33 20 21 22 23]
    packssdw        m1, m2                        ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
    vextracti128    xm0, m1, 1

    movq            [r1], xm1
    movq            [r1 + r2], xm0
    movhps          [r1 + 2 * r2], xm0
    movhps          [r1 + r3], xm1
    RET
%endif