Spaces:
Runtime error
Runtime error
/* -*-arm64-*-
 * vim: syntax=arm64asm
 *
 * Copyright (c) 2022 J. Dekker <[email protected]>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"

// The code below uses (MAX_PB_SIZE << 1) as the byte pitch of one row of the
// int16_t dst (qpel) and src2 (qpel_bi) buffers, i.e. MAX_PB_SIZE elements.
#define MAX_PB_SIZE 64
// 8-tap luma interpolation filter table: one row of 8 signed bytes per value
// of mx (load_filter indexes it with mx * 8 and sign-extends the row).
// Row 0 is all zero; every other row sums to 64, matching the #6 rounding
// shift applied by the qpel_uni functions.
const qpel_filters, align=4
        .byte            0,  0,  0,  0,  0,  0,  0,  0
        .byte           -1,  4,-10, 58, 17, -5,  1,  0
        .byte           -1,  4,-11, 40, 40,-11,  4, -1
        .byte            0,  1, -5, 17, 58,-10,  4, -1
endconst
// load_filter m
//   m: 64-bit general-purpose register holding the filter row index.
// Loads the 8 signed byte taps of row \m of qpel_filters and sign-extends
// them to 16 bits, leaving them in v0.8h.
// Clobbers: x15, v0.
.macro load_filter m
        movrel          x15, qpel_filters
        add             x15, x15, \m, lsl #3            // 8 bytes per filter row
        ld1             {v0.8b}, [x15]
        sxtl            v0.8h, v0.8b
.endm
// put_hevc type
//   type: one of qpel, qpel_uni, qpel_bi.
// Expands to the exported ff_hevc_put_hevc_<type>_h{4,6,8,12,16}_8_neon
// functions. When type is qpel it also emits the shared local filter kernels
// (ff_hevc_put_hevc_h{4,8,16}_8_neon) that all three variants call.
//
// The register aliases bound here map the C arguments of each variant to
// registers; they are released again by the .unreq list at the end of the
// macro so that the next expansion can rebind them.
.macro put_hevc type
.ifc \type, qpel
        // void put_hevc_qpel_h(int16_t *dst,
        //                      uint8_t *_src, ptrdiff_t _srcstride,
        //                      int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        dststride       .req x7         // not an argument: each function sets it to MAX_PB_SIZE << 1
        src             .req x1
        srcstride       .req x2
        height          .req x3
        heightw         .req w3
        mx              .req x4
        width           .req w6
.endif
.ifc \type, qpel_uni
        // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                          uint8_t *_src, ptrdiff_t _srcstride,
        //                          int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x4
        heightw         .req w4
        mx              .req x5
        width           .req w7
.endif
.ifc \type, qpel_bi
        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                         uint8_t *_src, ptrdiff_t _srcstride,
        //                         int16_t *src2, int height, intptr_t mx,
        //                         intptr_t my, int width)
        // src2 stays in x4 (no alias); width is loaded from the stack into w8
        // by the functions that need it.
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x5
        heightw         .req w5
        mx              .req x6
        width           .req w8
.endif
// Shared 4-column filter kernel, emitted only in the qpel expansion and
// called with bl from the h4 function of every variant.
//   In:  v16, v17 = 16 consecutive source bytes of the first row
//        v18, v19 = 16 consecutive source bytes of the second row
//        v0.8h    = the 8 filter taps (see load_filter)
//   Out: v23.4h   = 16-bit filter sums for the first row
//        v24.4h   = 16-bit filter sums for the second row
//   Clobbers: v16-v21 (inputs are widened in place, v20/v21 are scratch).
.ifc \type, qpel
function ff_hevc_put_hevc_h4_8_neon, export=0
        // widen the source bytes to unsigned 16 bit
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        // tap 0
        mul             v23.4h, v16.4h, v0.h[0]
        mul             v24.4h, v18.4h, v0.h[0]
        // taps 1..7: shift the window by \i halfwords and accumulate
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.4h, v20.4h, v0.h[\i]
        mla             v24.4h, v21.4h, v0.h[\i]
.endr
        ret
endfunc
.endif
// Horizontal 8-tap filter, 4 columns wide, two rows per loop iteration.
// Arguments follow the prototype of the selected \type (see the register
// aliases at the top of the macro).
//   qpel:     stores the raw 16-bit filter sums to dst (row pitch MAX_PB_SIZE << 1 bytes)
//   qpel_uni: stores sums rounded by >> 6 and saturated to unsigned 8 bit
//   qpel_bi:  adds the int16 row from src2 (x4) with saturation, then rounds
//             by >> 7 and saturates to unsigned 8 bit
// Scratch: x10 (second dst row), x12 (second src row), x13/x14 (double
// strides), x15/x16 (qpel_bi: second src2 row and its double stride).
function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // filter window starts 3 samples to the left
        mov             mx, x30                         // mx is dead after load_filter: keep the return address in it across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif
        bl              ff_hevc_put_hevc_h4_8_neon
        subs            heightw, heightw, #2            // flags are consumed by b.gt below
.ifc \type, qpel
        st1             {v23.4h}, [dst], x14
        st1             {v24.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.4h, v23.4h, v25.4h
        sqadd           v24.4h, v24.4h, v26.4h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], x14
        st1             {v24.s}[0], [x10], x14
.endif
        b.gt            0b                              // double line
        ret             mx
endfunc
// Shared 8-column filter kernel, emitted only in the qpel expansion and
// called with bl from the h6 and h8 functions of every variant.
//   In:  v16, v17 = 16 consecutive source bytes of the first row
//        v18, v19 = 16 consecutive source bytes of the second row
//        v0.8h    = the 8 filter taps (see load_filter)
//   Out: v23.8h   = 16-bit filter sums for the first row
//        v24.8h   = 16-bit filter sums for the second row
//   Clobbers: v16-v21 (inputs are widened in place, v20/v21 are scratch).
.ifc \type, qpel
function ff_hevc_put_hevc_h8_8_neon, export=0
        // widen the source bytes to unsigned 16 bit
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        // tap 0
        mul             v23.8h, v16.8h, v0.h[0]
        mul             v24.8h, v18.8h, v0.h[0]
        // taps 1..7: shift the window by \i halfwords and accumulate
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.8h, v20.8h, v0.h[\i]
        mla             v24.8h, v21.8h, v0.h[\i]
.endr
        ret
endfunc
.endif
// Horizontal 8-tap filter, 6 columns wide, two rows per loop iteration.
// Eight columns are computed with the 8-wide kernel; only the first six are
// stored, as two partial stores per row. x14 is the double stride minus the
// bytes already advanced by the first store of each row.
//   qpel:     16-bit sums      -> 4 halfwords + one 32-bit lane (halfwords 4,5)
//   qpel_uni: >> 6, saturated  -> 4 bytes + one 16-bit lane (bytes 4,5)
//   qpel_bi:  + src2 (x4), >> 7, saturated -> same store pattern as qpel_uni
function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // filter window starts 3 samples to the left
        mov             mx, x30                         // mx is dead after load_filter: keep the return address in it across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 8)
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
        sub             x14, x14, #4
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif
        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2            // flags are consumed by b.gt below
.ifc \type, qpel
        st1             {v23.4h}, [dst], #8
        st1             {v24.4h}, [x10], #8
        st1             {v23.s}[2], [dst], x14
        st1             {v24.s}[2], [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], #4
        st1             {v24.s}[0], [x10], #4
        st1             {v23.h}[2], [dst], x14
        st1             {v24.h}[2], [x10], x14
.endif
        b.gt            0b                              // double line
        ret             mx
endfunc
// Horizontal 8-tap filter, 8 columns wide, two rows per loop iteration.
// Arguments follow the prototype of the selected \type (see the register
// aliases at the top of the macro).
//   qpel:     stores the raw 16-bit filter sums (8 halfwords per row)
//   qpel_uni: stores sums rounded by >> 6 and saturated to unsigned 8 bit
//   qpel_bi:  adds the int16 row from src2 (x4) with saturation, then rounds
//             by >> 7 and saturates to unsigned 8 bit
function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // filter window starts 3 samples to the left
        mov             mx, x30                         // mx is dead after load_filter: keep the return address in it across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif
        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2            // flags are consumed by b.gt below
.ifc \type, qpel
        st1             {v23.8h}, [dst], x14
        st1             {v24.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.8b}, [dst], x14
        st1             {v24.8b}, [x10], x14
.endif
        b.gt            0b                              // double line
        ret             mx
endfunc
// Shared 16-column filter kernel, emitted only in the qpel expansion and
// called with bl from the h12 and h16 functions of every variant.
//   In:  v16-v18 = 24 consecutive source bytes of the first row
//        v19-v21 = 24 consecutive source bytes of the second row
//        v0.8h   = the 8 filter taps (see load_filter)
//        x9      = remaining row count of the caller's inner loop
//   Out: v26.8h, v27.8h = 16-bit sums, first row, columns 0-7 and 8-15
//        v28.8h, v29.8h = 16-bit sums, second row, columns 0-7 and 8-15
//        x9 decremented by 2; the resulting flags are what the callers'
//        "b.gt 1b" tests, so callers must not modify NZCV before that branch.
//   Clobbers: v16-v25 (inputs are widened in place, v22-v25 are scratch).
.ifc \type, qpel
function ff_hevc_put_hevc_h16_8_neon, export=0
        // widen the source bytes to unsigned 16 bit
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        uxtl            v20.8h, v20.8b
        uxtl            v21.8h, v21.8b
        // tap 0
        mul             v26.8h, v16.8h, v0.h[0]
        mul             v27.8h, v17.8h, v0.h[0]
        mul             v28.8h, v19.8h, v0.h[0]
        mul             v29.8h, v20.8h, v0.h[0]
        // taps 1..7: shift the window by \i halfwords and accumulate
.irpc i, 1234567
        ext             v22.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v17.16b, v18.16b, #(2*\i)
        ext             v24.16b, v19.16b, v20.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
        mla             v26.8h, v22.8h, v0.h[\i]
        mla             v27.8h, v23.8h, v0.h[\i]
        mla             v28.8h, v24.8h, v0.h[\i]
        mla             v29.8h, v25.8h, v0.h[\i]
.endr
        subs            x9, x9, #2                      // row counter for the caller's inner loop
        ret
endfunc
.endif
// Horizontal 8-tap filter in column strips of 12, two rows per inner
// iteration. Sixteen columns are computed with the 16-wide kernel and the
// first twelve are stored. After each strip the row pointers are rewound by
// height rows and advanced by 12 columns; the outer loop runs while
// width - 12 stays positive.
//   qpel:     16-bit sums      -> 8 + 4 halfwords per row
//   qpel_uni: >> 6, saturated  -> 8 bytes + one 32-bit lane per row
//   qpel_bi:  + src2 (x4), >> 7, saturated -> same store pattern as qpel_uni
// x14 is the double dst stride minus the bytes advanced by the first store.
function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw                 // height is used as a 64-bit multiplier below
.ifc \type, qpel_bi
        ldrh            w8, [sp]                        // width
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        lsl             x17, height, #7                 // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // filter window starts 3 samples to the left
        mov             mx, x30                         // mx is dead after load_filter: keep the return address in it across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16)
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
        sub             x14, x14, #8
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      mov             x9, height
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        bl              ff_hevc_put_hevc_h16_8_neon     // also does subs x9, x9, #2

.ifc \type, qpel
        st1             {v26.8h}, [dst], #16
        st1             {v28.8h}, [x10], #16
        st1             {v27.4h}, [dst], x14
        st1             {v29.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b}, [dst], #8
        st1             {v28.8b}, [x10], #8
        st1             {v27.s}[0], [dst], x14
        st1             {v29.s}[0], [x10], x14
.endif
        b.gt            1b                              // double line (flags from the kernel's subs x9)
        subs            width, width, #12               // flags are consumed by b.gt 0b below
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc
        sub             x4,  x4,  x17
        sub             x15, x15, x17
        add             x4,  x4,  #24
        add             x15, x15, #24
.endif
        add             src, src, #12
        add             x12, x12, #12
.ifc \type, qpel
        add             dst, dst, #24
        add             x10, x10, #24
.else
        add             dst, dst, #12
        add             x10, x10, #12
.endif
        b.gt            0b
        ret             mx
endfunc
// Horizontal 8-tap filter in column strips of 16, two rows per inner
// iteration. After each strip the row pointers are rewound by height rows
// and advanced by 16 columns; the outer loop runs while width - 16 stays
// positive.
//   qpel:     stores the raw 16-bit filter sums (2 x 8 halfwords per row)
//   qpel_uni: stores sums rounded by >> 6 and saturated to unsigned 8 bit
//   qpel_bi:  adds the int16 row from src2 (x4) with saturation, then rounds
//             by >> 7 and saturates to unsigned 8 bit
// x14 is the double dst stride minus the bytes advanced by the first store.
function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw                 // height is used as a 64-bit multiplier below
.ifc \type, qpel_bi
        ldrh            w8, [sp]                        // width
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        lsl             x17, height, #7                 // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
.endif
        sub             src, src, #3                    // filter window starts 3 samples to the left
        mov             mx, x30                         // mx is dead after load_filter: keep the return address in it across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16)
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
        sub             x14, x14, #8
.endif
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      mov             x9, height
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        bl              ff_hevc_put_hevc_h16_8_neon     // also does subs x9, x9, #2

.ifc \type, qpel
        st1             {v26.8h}, [dst], #16
        st1             {v28.8h}, [x10], #16
        st1             {v27.8h}, [dst], x14
        st1             {v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b}, [dst], #8
        st1             {v28.8b}, [x10], #8
        st1             {v27.8b}, [dst], x14
        st1             {v29.8b}, [x10], x14
.endif
        b.gt            1b                              // double line (flags from the kernel's subs x9)
        subs            width, width, #16               // flags are consumed by b.gt 0b below
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc
        sub             x4,  x4,  x17
        sub             x15, x15, x17
        add             x4,  x4,  #32
        add             x15, x15, #32
.endif
        add             src, src, #16
        add             x12, x12, #16
.ifc \type, qpel
        add             dst, dst, #32
        add             x10, x10, #32
.else
        add             dst, dst, #16
        add             x10, x10, #16
.endif
        b.gt            0b
        ret             mx
endfunc
// Release the per-type register aliases so the next put_hevc expansion can
// bind the same names to different registers.
.unreq height
.unreq heightw
.unreq width
.unreq src
.unreq dst
.unreq srcstride
.unreq dststride
.unreq mx
.endm
// Instantiate the function set once per prediction type. The qpel expansion
// also emits the shared local kernels that the other two expansions call.
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi