Spaces:
Runtime error
Runtime error
/*
 * Copyright (c) 2008 Mans Rullgard <[email protected]>
 * Copyright (c) 2013 Janne Grunau <[email protected]>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S" | |
#include "neon.S" | |
/* H.264 qpel MC */
// Load the filter multipliers into v6.
// The scratch register is set to (20 << 16) | 5 and moved into lane S[0],
// which leaves v6.H[0] = 5 and v6.H[1] = 20.
//   r - 32-bit general-purpose scratch register (clobbered)
.macro  lowpass_const   r
        movz            \r,  #20, lsl #16
        movk            \r,  #5
        mov             v6.S[0], \r
.endm
// Horizontal 6-tap lowpass over two rows, 8 output pixels per row.
//   r0, r1 - low and high 8 bytes of the first row (ext treats the pair as
//            one 16-byte vector)
//   r2, r3 - the same for the second row
//   d0, d1 - result registers for the first and second row
// Naming the bytes at offsets 0..5 as a..f, every output lane is
//   a + f + 20*(c + d) - 5*(b + e)
// with 20 read from v6.H[1] and 5 from v6.H[0] (see lowpass_const).
// When narrow is non-zero (default) the 16-bit sums are rounded, shifted
// right by 5 and saturated to unsigned bytes. Otherwise d0/d1 keep the
// raw .8H sums.
// d0 and d1 are written only after the last read of the row they belong
// to, which is what lets the callers in this file reuse input registers.
//trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,      v2.8B,     v3.8B      // c + d
        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,      v4.8B,     v5.8B      // b + e
        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H,  \r0\().8B, v1.8B      // a + f
        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
        mla             \d0\().8H,  v2.8H,     v6.H[1]    // += 20 * (c + d)
        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
        uaddl           v0.8H,      v0.8B,     v1.8B
        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
        mls             \d0\().8H,  v4.8H,     v6.H[0]    // -= 5 * (b + e)
        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
        uaddl           v1.8H,      v1.8B,     v3.8B
        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
        uaddl           \d1\().8H,  \r2\().8B, v2.8B
        mla             \d1\().8H,  v0.8H,     v6.H[1]
        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
.endm
// Vertical 6-tap lowpass producing two output rows of 8 pixels.
//   r0..r6 - seven consecutive input rows (.8B each)
//   d0     - r0 + r5 + 20*(r2 + r3) - 5*(r1 + r4)
//   d1     - r1 + r6 + 20*(r3 + r4) - 5*(r2 + r5)
// The multipliers 20 and 5 are read from v6.H[1] and v6.H[0].
// When narrow is non-zero (default) the sums are rounded, shifted right
// by 5 and saturated to unsigned bytes. Otherwise they stay as .8H.
// The callers in this file pass d0 = r0 and d1 = r1. That is safe because
// both are written only after every read of r0 and r1.
//trashes v0-v4
.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
        uaddl           v2.8H,      \r2\().8B, \r3\().8B
        uaddl           v0.8H,      \r3\().8B, \r4\().8B
        uaddl           v4.8H,      \r1\().8B, \r4\().8B
        uaddl           v1.8H,      \r2\().8B, \r5\().8B
        uaddl           \d0\().8H,  \r0\().8B, \r5\().8B
        uaddl           \d1\().8H,  \r1\().8B, \r6\().8B
        mla             \d0\().8H,  v2.8H,     v6.H[1]
        mls             \d0\().8H,  v4.8H,     v6.H[0]
        mla             \d1\().8H,  v0.8H,     v6.H[1]
        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
.endm
// Horizontal 6-tap lowpass on two rows, keeping the 16-bit sums.
//   r0, r1 - in:  one row of 16 source bytes each (.16B)
//            out: eight unscaled 16-bit sums each (.8H), overwriting the
//                 input row
// Each ext rotates the row by 1..5 bytes and only the low 8 bytes of the
// rotated value are used, so the result depends on bytes 0..12 of the row.
// Per lane the sum is a + f + 20*(c + d) - 5*(b + e), with 20 and 5 read
// from v6.H[1] and v6.H[0]. No rounding or narrowing is done here.
//trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
        uaddl           v0.8H,      v0.8B,      v1.8B
        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
        uaddl           v2.8H,      v2.8B,      v3.8B
        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
        mla             \r0\().8H,  v0.8H,      v6.H[1]
        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
        uaddl           v4.8H,      v4.8B,      v5.8B
        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
        mls             \r0\().8H,  v2.8H,      v6.H[0]
        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
        uaddl           v7.8H,      v7.8B,      v0.8B
        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
        mla             \r1\().8H,  v4.8H,      v6.H[1]
        mls             \r1\().8H,  v7.8H,      v6.H[0]
.endm
// Horizontal 6-tap lowpass over a single row of 8 output pixels.
//   r0, r1 - low and high 8 bytes of the row
//   d0     - result register
// Same arithmetic as lowpass_8, applied to one row only:
//   a + f + 20*(c + d) - 5*(b + e), with 20 = v6.H[1] and 5 = v6.H[0].
// When narrow is non-zero (default) the sum is rounded, shifted right by 5
// and saturated to unsigned bytes.
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,      v2.8B,     v3.8B
        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,      v4.8B,     v5.8B
        ext             v30.8B,     \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H,  \r0\().8B, v30.8B
        mla             \d0\().8H,  v2.8H,     v6.H[1]
        mls             \d0\().8H,  v4.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
  .endif
.endm
// Vertical 6-tap lowpass over six rows of signed 16-bit values
// (r0..r5, .8H each), evaluated with 32-bit intermediates:
//   (r0 + r5) + 20*(r2 + r3) - 5*(r1 + r4)
// The x20 and x5 products are built from shifts and adds (x16 + x4 and
// x1 + x4). v6 is used as scratch here, so constants stored in it by
// lowpass_const do not survive this macro.
// The sum is rounded and shifted right by 10, narrowed to 16 bits, then
// saturated to unsigned bytes and written to r0 as .8B.
// trashes v0-v7
.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
        saddl           v5.4S,      \r2\().4H,  \r3\().4H
        saddl2          v1.4S,      \r2\().8H,  \r3\().8H
        saddl           v6.4S,      \r1\().4H,  \r4\().4H
        saddl2          v2.4S,      \r1\().8H,  \r4\().8H
        saddl           v0.4S,      \r0\().4H,  \r5\().4H
        saddl2          v4.4S,      \r0\().8H,  \r5\().8H

        shl             v3.4S,  v5.4S,  #4
        shl             v5.4S,  v5.4S,  #2
        shl             v7.4S,  v6.4S,  #2
        add             v5.4S,  v5.4S,  v3.4S           // 20 * (r2 + r3), low half
        add             v6.4S,  v6.4S,  v7.4S           //  5 * (r1 + r4), low half

        shl             v3.4S,  v1.4S,  #4
        shl             v1.4S,  v1.4S,  #2
        shl             v7.4S,  v2.4S,  #2
        add             v1.4S,  v1.4S,  v3.4S           // 20 * (r2 + r3), high half
        add             v2.4S,  v2.4S,  v7.4S           //  5 * (r1 + r4), high half

        add             v5.4S,  v5.4S,  v0.4S
        sub             v5.4S,  v5.4S,  v6.4S

        add             v1.4S,  v1.4S,  v4.4S
        sub             v1.4S,  v1.4S,  v2.4S

        rshrn           v5.4H,  v5.4S,  #10
        rshrn2          v5.8H,  v1.4S,  #10

        sqxtun          \r0\().8B,  v5.8H
.endm
// Horizontal lowpass of a 16-column, 16-row area into a packed buffer.
//   x0 - destination buffer, written with a fixed stride of 8 bytes
//   x1 - source pointer, x2 - source stride
// The left 8 columns are filtered for 16 rows first. x1 is then rewound by
// 16 rows and moved 8 bytes to the right, and the second pass is entered
// with a tail branch. x0 is not rewound, so the right half lands directly
// after the left half in the buffer.
// Clobbers x3, x4 (saved link register) and x12, plus everything
// put_h264_qpel8_h_lowpass_neon clobbers.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon
endfunc
// Instantiates the horizontal lowpass functions for type = put or avg:
//   <type>_h264_qpel16_h_lowpass_neon and <type>_h264_qpel8_h_lowpass_neon
// Register interface of the 8-wide function:
//   x0  - destination pointer, x3 - destination stride
//   x1  - source pointer, x2 - source stride
//   x12 - number of rows (two rows are processed per loop iteration)
//   v6  - filter constants set up by lowpass_const
// For avg the filtered rows are combined with the existing destination
// rows using a rounding halving add before being stored.
// The 16-wide function sets x12 itself. It calls the 8-wide function for
// the left half, rewinds x0 and x1 by 16 rows, steps 8 bytes to the right
// and then falls through into the 8-wide function for the right half.
// NOTE(review): the fall-through assumes endfunc/function (defined in
// asm.S, not visible here) emit no instructions between the two bodies.
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30                // keep the return address across bl
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13                // final ret goes back to the original caller
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8B, v29.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8B},    [x0], x3
        ld1             {v3.8B},    [x0]
        urhadd          v28.8B, v28.8B, v2.8B
        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0,  x0,  x3            // back to the first of the two rows
  .endif
        st1             {v28.8B},   [x0], x3
        st1             {v16.8B},   [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
// Instantiates the horizontal lowpass functions that average the filter
// output with a second source, for type = put or avg:
//   <type>_h264_qpel16_h_lowpass_l2_neon, <type>_h264_qpel8_h_lowpass_l2_neon
// Register interface of the 8-wide function:
//   x0  - destination pointer
//   x1  - source pointer for the filter
//   x3  - second source pointer, averaged with the filter output
//   x2  - stride, shared by x0, x1 and x3
//   x12 - number of rows (two rows are processed per loop iteration)
//   v6  - filter constants set up by lowpass_const
// For avg the result is additionally averaged with the existing
// destination rows. All averaging is a rounding halving add.
// The 16-wide function handles the left half, rewinds all three pointers
// by 16 rows, steps 8 bytes to the right and falls through into the 8-wide
// function for the right half.
// NOTE(review): the fall-through assumes endfunc/function (defined in
// asm.S, not visible here) emit no instructions between the two bodies.
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30                // keep the return address across bl
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8B, v27.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        ld1             {v28.8B},   [x3], x2
        ld1             {v29.8B},   [x3], x2
        subs            x12, x12, #2
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8B, v26.8B, v28.8B
        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
        ld1             {v2.8B},    [x0], x2
        ld1             {v3.8B},    [x0]
        urhadd          v26.8B, v26.8B, v2.8B
        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0,  x0,  x2            // back to the first of the two rows
  .endif
        st1             {v26.8B},   [x0], x2
        st1             {v27.8B},   [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
// Vertical lowpass of a 16x16 area into a packed buffer.
//   x0 - destination buffer, written with a fixed stride of 8 bytes
//   x1 - source pointer, x3 - source stride
// Four 8x8 blocks are produced in the order top-left, bottom-left,
// top-right, bottom-right. x0 is never rewound, so the blocks follow each
// other in the buffer.
// Each 8x8 call leaves x1 twelve rows below its start. Subtracting four
// rows moves it to the start of the block underneath. Before the right
// column x1 is taken back 20 rows in total and moved 8 bytes to the right.
// Clobbers x2 and x4 (saved link register), plus everything
// put_h264_qpel8_v_lowpass_neon clobbers.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc
// Instantiates the vertical lowpass functions for type = put or avg:
//   <type>_h264_qpel16_v_lowpass_neon and <type>_h264_qpel8_v_lowpass_neon
// Register interface of the 8x8 function:
//   x0 - destination pointer, x2 - destination stride
//   x1 - source pointer (first of 13 rows read), x3 - source stride
//   v6 - filter constants set up by lowpass_const
// On return x0 has advanced by 8 rows and x1 by 12 rows.
// For avg the filtered rows are combined with the existing destination
// rows using a rounding halving add before being stored.
// The 16x16 function runs the 8x8 function on the top-left and bottom-left
// blocks, repositions x0/x1 to the top of the right column, runs the
// top-right block and falls through into the 8x8 function for the last one.
// NOTE(review): the fall-through assumes endfunc/function (defined in
// asm.S, not visible here) emit no instructions between the two bodies.
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30                // keep the return address across bl
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 rows forward, 4 back: next block
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v25.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v27.8B}, [x1], x3
        ld1             {v28.8B}, [x1]

        // Each call produces two output rows, written over its first two inputs.
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        ld1             {v24.8B}, [x0], x2
        ld1             {v25.8B}, [x0], x2
        ld1             {v26.8B}, [x0], x2
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v27.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v28.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v29.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v30.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v31.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v29.8B
        urhadd          v22.8B, v22.8B, v30.8B
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x2, lsl #3    // back to the first destination row
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
// Instantiates the vertical lowpass functions that average the filter
// output with a second source, for type = put or avg:
//   <type>_h264_qpel16_v_lowpass_l2_neon, <type>_h264_qpel8_v_lowpass_l2_neon
// Register interface of the 8x8 function:
//   x0  - destination pointer
//   x1  - source pointer for the filter (first of 13 rows read)
//   x3  - stride used for both x0 and x1
//   x12 - second source pointer, x2 - its stride
//   v6  - filter constants set up by lowpass_const
// On return x0 and x12 have advanced by 8 rows and x1 by 12 rows.
// For avg the result is additionally averaged with the existing
// destination rows. All averaging is a rounding halving add.
// The 16x16 function runs the top-left and bottom-left blocks, repositions
// x0, x1 and x12 to the top of the right column, runs the top-right block
// and falls through into the 8x8 function for the last one.
// NOTE(review): the fall-through assumes endfunc/function (defined in
// asm.S, not visible here) emit no instructions between the two bodies.
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30                // keep the return address across bl
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // 12 rows forward, 4 back: next block
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x12, x12, x2, lsl #4
        add             x0,  x0,  #8
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v25.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v27.8B}, [x1], x3
        ld1             {v28.8B}, [x1]

        // Each call produces two output rows, written over its first two inputs.
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // Average with the eight rows of the second source.
        ld1             {v24.8B}, [x12], x2
        ld1             {v25.8B}, [x12], x2
        ld1             {v26.8B}, [x12], x2
        ld1             {v27.8B}, [x12], x2
        ld1             {v28.8B}, [x12], x2
        urhadd          v16.8B, v24.8B, v16.8B
        urhadd          v17.8B, v25.8B, v17.8B
        ld1             {v29.8B}, [x12], x2
        urhadd          v18.8B, v26.8B, v18.8B
        urhadd          v19.8B, v27.8B, v19.8B
        ld1             {v30.8B}, [x12], x2
        urhadd          v20.8B, v28.8B, v20.8B
        urhadd          v21.8B, v29.8B, v21.8B
        ld1             {v31.8B}, [x12], x2
        urhadd          v22.8B, v30.8B, v22.8B
        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
        ld1             {v24.8B}, [x0], x3
        ld1             {v25.8B}, [x0], x3
        ld1             {v26.8B}, [x0], x3
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v27.8B}, [x0], x3
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v28.8B}, [x0], x3
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v29.8B}, [x0], x3
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v30.8B}, [x0], x3
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v31.8B}, [x0], x3
        urhadd          v21.8B, v21.8B, v29.8B
        urhadd          v22.8B, v22.8B, v30.8B
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x3, lsl #3    // back to the first destination row
  .endif

        st1             {v16.8B}, [x0], x3
        st1             {v17.8B}, [x0], x3
        st1             {v18.8B}, [x0], x3
        st1             {v19.8B}, [x0], x3
        st1             {v20.8B}, [x0], x3
        st1             {v21.8B}, [x0], x3
        st1             {v22.8B}, [x0], x3
        st1             {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
// Shared core of the 2-D (horizontal then vertical) lowpass for one 8x8
// block.
//   x1 - source pointer (first of 13 rows read, 16 bytes per row)
//   x3 - source stride
// Loads the filter constants itself (clobbering x12). It filters each row
// horizontally to 16-bit sums with lowpass_8H, then filters those sums
// vertically with lowpass_8.16. The eight result rows are returned as
// bytes in v16..v23. Nothing is stored to memory here.
// On return x1 has advanced by 12 rows.
// v29 is never loaded. It is only passed as the second row of the last
// lowpass_8H pair, and its result is not read by any lowpass_8.16 below.
// Clobbers v0-v7 and v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8H}, [x1], x3
        ld1             {v17.8H}, [x1], x3
        ld1             {v18.8H}, [x1], x3
        ld1             {v19.8H}, [x1], x3
        ld1             {v20.8H}, [x1], x3
        ld1             {v21.8H}, [x1], x3
        ld1             {v22.8H}, [x1], x3
        ld1             {v23.8H}, [x1], x3
        ld1             {v24.8H}, [x1], x3
        ld1             {v25.8H}, [x1], x3
        ld1             {v26.8H}, [x1], x3
        ld1             {v27.8H}, [x1], x3
        ld1             {v28.8H}, [x1]

        // Horizontal pass: every register now holds eight 16-bit sums.
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        lowpass_8H      v28, v29

        // Vertical pass: a sliding window of six rows, result replaces the first.
        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc
// Instantiates <type>_h264_qpel8_hv_lowpass_neon for type = put or avg.
//   x0 - destination pointer, x2 - destination stride
//   x1 - source pointer, x3 - source stride (consumed by the _top helper)
// Runs put_h264_qpel8_hv_lowpass_neon_top and stores the eight result rows
// from v16..v23. For avg the rows are first combined with the existing
// destination rows using a rounding halving add.
// The return address is held in x10 across the call and used by ret.
// On return x0 has advanced by 8 rows.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8B},  [x0], x2
        ld1             {v1.8B},  [x0], x2
        ld1             {v2.8B},  [x0], x2
        urhadd          v16.8B, v16.8B, v0.8B
        ld1             {v3.8B},  [x0], x2
        urhadd          v17.8B, v17.8B, v1.8B
        ld1             {v4.8B},  [x0], x2
        urhadd          v18.8B, v18.8B, v2.8B
        ld1             {v5.8B},  [x0], x2
        urhadd          v19.8B, v19.8B, v3.8B
        ld1             {v6.8B},  [x0], x2
        urhadd          v20.8B, v20.8B, v4.8B
        ld1             {v7.8B},  [x0], x2
        urhadd          v21.8B, v21.8B, v5.8B
        urhadd          v22.8B, v22.8B, v6.8B
        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0,  x0,  x2, lsl #3    // back to the first destination row
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
// Instantiates <type>_h264_qpel8_hv_lowpass_l2_neon for type = put or avg.
//   x0 - destination pointer, x3 - destination stride
//   x1 - source pointer, x3 - source stride (consumed by the _top helper)
//   x2 - second source: eight packed rows of 8 bytes (64 bytes in total),
//        read two rows at a time with a post-increment of 16
// Runs put_h264_qpel8_hv_lowpass_neon_top, then averages its result rows
// (v16..v23) with the packed rows from x2. For avg the result is averaged
// once more with the existing destination rows. All averaging is a
// rounding halving add.
// The return address is held in x10 across the call and used by ret.
// On return x0 has advanced by 8 rows and x2 by 64 bytes.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8B, v1.8B}, [x2], #16
        ld1             {v2.8B, v3.8B}, [x2], #16
        urhadd          v0.8B,  v0.8B,  v16.8B
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v4.8B, v5.8B}, [x2], #16
        urhadd          v2.8B,  v2.8B,  v18.8B
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v6.8B, v7.8B}, [x2], #16
        urhadd          v4.8B,  v4.8B,  v20.8B
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
        ld1             {v16.8B}, [x0], x3
        ld1             {v17.8B}, [x0], x3
        ld1             {v18.8B}, [x0], x3
        urhadd          v0.8B,  v0.8B,  v16.8B
        ld1             {v19.8B}, [x0], x3
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v20.8B}, [x0], x3
        urhadd          v2.8B,  v2.8B,  v18.8B
        ld1             {v21.8B}, [x0], x3
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v22.8B}, [x0], x3
        urhadd          v4.8B,  v4.8B,  v20.8B
        ld1             {v23.8B}, [x0], x3
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
        sub             x0,  x0,  x3, lsl #3    // back to the first destination row
  .endif
        st1             {v0.8B},  [x0], x3
        st1             {v1.8B},  [x0], x3
        st1             {v2.8B},  [x0], x3
        st1             {v3.8B},  [x0], x3
        st1             {v4.8B},  [x0], x3
        st1             {v5.8B},  [x0], x3
        st1             {v6.8B},  [x0], x3
        st1             {v7.8B},  [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
// Instantiates the 16x16 2-D lowpass wrappers for type = put or avg:
//   <type>_h264_qpel16_hv_lowpass_neon and
//   <type>_h264_qpel16_hv_lowpass_l2_neon
// Both run the matching 8x8 function four times in the order top-left,
// bottom-left, top-right, bottom-right. The last block is entered with a
// tail branch after x30 has been restored from x13.
// Pointer bookkeeping between blocks:
//   - each 8x8 call leaves x1 twelve rows below its start, so subtracting
//     four rows selects the block underneath
//   - before the right column x1 goes back 20 rows and x0 goes back 16
//     rows, and both move 8 bytes to the right
// Plain variant: x0 = destination, x2 = destination stride, x1 = source,
//                x3 = source stride.
// l2 variant:    x0 = destination, x3 = stride for destination and source,
//                x1 = source, x4 = pointer just past a 256-byte packed
//                second source. x2 is derived from it and advances by 64
//                bytes per 8x8 block.
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256          // start of the packed second source
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
// Instantiates the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon
// for type = put or avg.
// Entry interface, as used by every function below:
//   x0 - destination pointer
//   x1 - source pointer
//   x2 - stride, applied to both source and destination
// Register conventions inside the macro:
//   x14     - saved return address (entry points that use bl)
//   x8, x9  - saved destination and source pointers for two-pass variants
//   x11     - saved stack pointer while a scratch buffer is allocated
//   x12     - row count for the horizontal helpers, or second-source
//             pointer for the vertical l2 helper
// Several entry points only set up registers and branch to a local label
// inside another entry point (mc01, mc11, mc21, mc12) to share its body.
// NOTE(review): "export=1" restores the argument name from the mangled
// ", =1" in the capture. Confirm against the function macro in
// libavutil/aarch64/asm.S.
.macro  h264_qpel8      type
// mc10: horizontal lowpass averaged with the source at the same position.
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// mc20: horizontal lowpass only.
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// mc30: horizontal lowpass averaged with the source one byte to the right.
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// mc01: vertical lowpass averaged with the source at the same position.
// Shared body: expects x12 = second source pointer and x14 = return address.
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // filter starts two rows above
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// mc11: horizontal lowpass into a 64-byte stack buffer (stride 8), then
// vertical lowpass averaged with that buffer.
// Shared body: expects x1 = source for the horizontal pass, x9 = source
// for the vertical pass, x8 = destination and x14 = return address.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc21: horizontal lowpass into a packed stack buffer, then 2-D lowpass
// averaged with that buffer.
// Shared body: expects x1 = source for the horizontal pass, x9 = source
// for the 2-D pass, x8 = destination and x14 = return address.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // x0 now points past the 64 bytes written
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64           // start of the packed rows
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc31: as mc11, with the vertical pass reading one byte to the right.
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// mc02: vertical lowpass only.
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// mc12: vertical lowpass into a packed stack buffer, then 2-D lowpass
// averaged with that buffer.
// Shared body: expects x1 = source for the vertical pass, x9 = source for
// the 2-D pass, x8 = destination and x14 = return address.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // x0 now points past the 64 bytes written
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64           // start of the packed rows
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc22: 2-D lowpass only. The hv helper loads the filter constants itself.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc32: as mc12, with the vertical pass reading one byte to the right.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// mc03: as mc01, averaging with the source one row further down.
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// mc13: as mc11, with the horizontal pass reading one row further down.
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// mc23: as mc21, with the horizontal pass reading one row further down.
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// mc33: as mc11, with the vertical pass one byte to the right and the
// horizontal pass one row further down.
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
// Instantiates the exported 16x16 entry points
// ff_<type>_h264_qpel16_mcXY_neon for type = put or avg.
// Entry interface, as used by every function below:
//   x0 - destination pointer
//   x1 - source pointer
//   x2 - stride, applied to both source and destination
// Register conventions inside the macro:
//   x14     - saved return address (entry points that use bl)
//   x8, x9  - saved destination and source pointers for two-pass variants
//   x11     - saved stack pointer while a scratch buffer is allocated
//   x12     - second-source pointer for the vertical l2 helper
//   x4      - end of the packed scratch buffer, for the hv l2 helper
// The 16-wide helpers set their own row counts, so x12 is not loaded with
// a count here. Several entry points only set up registers and branch to
// a local label inside another entry point (mc01, mc11, mc21, mc12).
// NOTE(review): "export=1" restores the argument name from the mangled
// ", =1" in the capture. Confirm against the function macro in
// libavutil/aarch64/asm.S.
.macro  h264_qpel16     type
// mc10: horizontal lowpass averaged with the source at the same position.
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// mc20: horizontal lowpass only.
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// mc30: horizontal lowpass averaged with the source one byte to the right.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// mc01: vertical lowpass averaged with the source at the same position.
// Shared body: expects x12 = second source pointer and x14 = return address.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // filter starts two rows above
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// mc11: horizontal lowpass into a 256-byte stack buffer (stride 16), then
// vertical lowpass averaged with that buffer.
// Shared body: expects x1 = source for the horizontal pass, x9 = source
// for the vertical pass, x8 = destination and x14 = return address.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc21: horizontal lowpass into a packed stack buffer, then 2-D lowpass
// averaged with that buffer.
// Shared body: expects x1 = source for the horizontal pass, x9 = source
// for the 2-D pass, x8 = destination and x14 = return address.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                 // end of the packed data just written
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc31: as mc11, with the vertical pass reading one byte to the right.
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// mc02: vertical lowpass only.
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// mc12: vertical lowpass into a packed stack buffer, then 2-D lowpass
// averaged with that buffer.
// Shared body: expects x1 = source for the vertical pass, x9 = source for
// the 2-D pass, x8 = destination and x14 = return address.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                 // end of the packed data just written
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc22: 2-D lowpass only.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                // restore stack
        ret             x14
endfunc

// mc32: as mc12, with the vertical pass reading one byte to the right.
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// mc03: as mc01, averaging with the source one row further down.
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// mc13: as mc11, with the horizontal pass reading one row further down.
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// mc23: as mc21, with the horizontal pass reading one row further down.
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// mc33: as mc11, with the vertical pass one byte to the right and the
// horizontal pass one row further down.
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg