Skip to content

Commit

Permalink
More optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
sadko4u committed Dec 11, 2024
1 parent 6544567 commit 1e0d004
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 63 deletions.
88 changes: 31 additions & 57 deletions include/private/dsp/arch/x86/avx512/hmath/hsum.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,18 @@ namespace lsp
/* x128 blocks */
__ASM_EMIT("sub $128, %[count]")
__ASM_EMIT("jb 2f")
__ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2")
__ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
__ASM_EMIT("1:")
__ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
__ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
__ASM_EMIT("vaddps 0x080(%[src]), %%zmm2, %%zmm2")
__ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm3, %%zmm3")
__ASM_EMIT("vaddps 0x080(%[src]), %%zmm0, %%zmm0")
__ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm1, %%zmm1")
__ASM_EMIT("vaddps 0x100(%[src]), %%zmm0, %%zmm0")
__ASM_EMIT("vaddps 0x140(%[src]), %%zmm1, %%zmm1")
__ASM_EMIT("vaddps 0x180(%[src]), %%zmm2, %%zmm2")
__ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm3, %%zmm3")
__ASM_EMIT("vaddps 0x180(%[src]), %%zmm0, %%zmm0")
__ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm1, %%zmm1")
__ASM_EMIT("add $0x200, %[src]")
__ASM_EMIT("sub $128, %[count]")
__ASM_EMIT("jae 1b")
__ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
__ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
__ASM_EMIT("2:")
/* x32 blocks */
__ASM_EMIT("add $96, %[count]")
Expand All @@ -68,31 +64,25 @@ namespace lsp
__ASM_EMIT("sub $32, %[count]")
__ASM_EMIT("jge 3b")
__ASM_EMIT("4:")
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
__ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
__ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
__ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
/* x16 block */
__ASM_EMIT("add $16, %[count]")
__ASM_EMIT("jl 6f")
__ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
__ASM_EMIT("vaddps 0x020(%[src]), %%ymm1, %%ymm1")
__ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
__ASM_EMIT("add $0x40, %[src]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("6:")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
__ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
__ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 8f")
__ASM_EMIT("vaddps 0x000(%[src]), %%xmm0, %%xmm0")
__ASM_EMIT("vaddps 0x010(%[src]), %%xmm1, %%xmm1")
__ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
__ASM_EMIT("add $0x20, %[src]")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("8:")
__ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
/* x4 block */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 10f")
Expand All @@ -116,7 +106,7 @@ namespace lsp
[res] "=Yz" (result)
:
: "cc", "memory",
"%xmm1", "%xmm2", "%xmm3"
/* zmm1 is the second accumulator; ymm2/xmm2 is scratch for the vextractf64x4 / vextractf128 horizontal-reduction steps, so it must be listed as clobbered as well */
"%xmm1", "%xmm2"
);

return result;
Expand Down Expand Up @@ -169,35 +159,27 @@ namespace lsp
__ASM_EMIT("sub $32, %[count]")
__ASM_EMIT("jge 3b")
__ASM_EMIT("4:")
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
__ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
__ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
__ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
/* x16 block */
__ASM_EMIT("add $16, %[count]")
__ASM_EMIT("jl 6f")
__ASM_EMIT("vmovups 0x000(%[src]), %%ymm4")
__ASM_EMIT("vmovups 0x020(%[src]), %%ymm5")
__ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm0")
__ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm1")
__ASM_EMIT("vmovups 0x000(%[src]), %%zmm4")
__ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
__ASM_EMIT("add $0x40, %[src]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("6:")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
__ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
__ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 8f")
__ASM_EMIT("vmovups 0x000(%[src]), %%xmm4")
__ASM_EMIT("vmovups 0x010(%[src]), %%xmm5")
__ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm0")
__ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm1")
__ASM_EMIT("vmovups 0x000(%[src]), %%ymm4")
__ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm0")
__ASM_EMIT("add $0x20, %[src]")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("8:")
__ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
/* x4 block */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 10f")
Expand Down Expand Up @@ -244,8 +226,8 @@ namespace lsp
(
__ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
__ASM_EMIT("vmovaps %[CC], %%zmm6")
__ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
__ASM_EMIT("vmovaps %%zmm6, %%zmm7")
__ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
/* x128 blocks */
__ASM_EMIT("sub $128, %[count]")
__ASM_EMIT("jb 2f")
Expand Down Expand Up @@ -282,35 +264,27 @@ namespace lsp
__ASM_EMIT("sub $32, %[count]")
__ASM_EMIT("jge 3b")
__ASM_EMIT("4:")
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
__ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
__ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
__ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
/* x16 block */
__ASM_EMIT("add $16, %[count]")
__ASM_EMIT("jl 6f")
__ASM_EMIT("vandps 0x000(%[src]), %%ymm6, %%ymm2")
__ASM_EMIT("vandps 0x020(%[src]), %%ymm7, %%ymm3")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
__ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
__ASM_EMIT("vandps 0x000(%[src]), %%zmm6, %%zmm2")
__ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
__ASM_EMIT("add $0x40, %[src]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("6:")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
__ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
__ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 8f")
__ASM_EMIT("vandps 0x000(%[src]), %%xmm6, %%xmm2")
__ASM_EMIT("vandps 0x010(%[src]), %%xmm7, %%xmm3")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
__ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
__ASM_EMIT("vandps 0x000(%[src]), %%ymm6, %%ymm2")
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
__ASM_EMIT("add $0x20, %[src]")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("8:")
__ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
/* x4 block */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 10f")
Expand Down
2 changes: 1 addition & 1 deletion src/test/ptest/hmath/h_abs_dotp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace lsp
// Pointer to a function that takes two const float arrays (a, b) and an element count and returns a single float.
typedef float (* h_dotp_t)(const float *a, const float *b, size_t count);
}

PTEST_BEGIN("dsp.hmath", h_abs_dotp, 5, 5000)
PTEST_BEGIN("dsp.hmath", h_abs_dotp, 2, 10000)

void call(const char *label, float *a, float *b, size_t count, h_dotp_t func)
{
Expand Down
2 changes: 1 addition & 1 deletion src/test/ptest/hmath/h_abs_sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ namespace lsp
// Pointer to a function that takes a const float array (src) and an element count and returns a single float.
typedef float (* h_sum_t)(const float *src, size_t count);
}

PTEST_BEGIN("dsp.hmath", h_abs_sum, 5, 5000)
PTEST_BEGIN("dsp.hmath", h_abs_sum, 2, 10000)

void call(const char *label, float *src, size_t count, h_sum_t func)
{
Expand Down
2 changes: 1 addition & 1 deletion src/test/ptest/hmath/h_dotp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace lsp
// Pointer to a function that takes two const float arrays (a, b) and an element count and returns a single float.
typedef float (* h_dotp_t)(const float *a, const float *b, size_t count);
}

PTEST_BEGIN("dsp.hmath", h_dotp, 5, 5000)
PTEST_BEGIN("dsp.hmath", h_dotp, 2, 10000)

void call(const char *label, float *a, float *b, size_t count, h_dotp_t func)
{
Expand Down
2 changes: 1 addition & 1 deletion src/test/ptest/hmath/h_sqr_dotp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace lsp
// Pointer to a function that takes two const float arrays (a, b) and an element count and returns a single float.
typedef float (* h_dotp_t)(const float *a, const float *b, size_t count);
}

PTEST_BEGIN("dsp.hmath", h_sqr_dotp, 5, 5000)
PTEST_BEGIN("dsp.hmath", h_sqr_dotp, 2, 10000)

void call(const char *label, float *a, float *b, size_t count, h_dotp_t func)
{
Expand Down
2 changes: 1 addition & 1 deletion src/test/ptest/hmath/h_sqr_sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace lsp
// Pointer to a function that takes a const float array (src) and an element count and returns a single float.
typedef float (* h_sum_t)(const float *src, size_t count);
}

PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 5000)
PTEST_BEGIN("dsp.hmath", h_sqr_sum, 2, 10000)

void call(const char *label, float *src, size_t count, h_sum_t func)
{
Expand Down
2 changes: 1 addition & 1 deletion src/test/ptest/hmath/h_sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ namespace lsp
// Pointer to a function that takes a const float array (src) and an element count and returns a single float.
typedef float (* h_sum_t)(const float *src, size_t count);
}

PTEST_BEGIN("dsp.hmath", h_sum, 5, 5000)
PTEST_BEGIN("dsp.hmath", h_sum, 2, 10000)

void call(const char *label, float *src, size_t count, h_sum_t func)
{
Expand Down

0 comments on commit 1e0d004

Please sign in to comment.