Re: [PATCH] LoongArch: Micro-optimize sc_save_fcc and sc_restore_fcc for LA464

From: Huacai Chen
Date: Thu Dec 14 2023 - 08:41:11 EST


Emmm, I want to keep the code simpler. :)

Huacai

On Thu, Dec 14, 2023 at 9:02 PM Xi Ruoyao <xry111@xxxxxxxxxxx> wrote:
>
> On LA464 movcf2gr is 7 times slower than movcf2fr + movfr2gr, and
> movgr2cf is 15 times (!) slower than movgr2fr + movfr2cf.
>
> On LA664 movcf2fr + movfr2gr has a similar performance with movcf2gr,
> and movgr2fr + movfr2cf has a similar performance with movgr2cf.
>
> To use FP registers in sc_save_fcc and sc_restore_fcc we need to save
> FP/LSX/LASX registers before sc_save_fcc, and restore FP/LSX/LASX
> registers after sc_restore_fcc.
>
> Signed-off-by: Xi Ruoyao <xry111@xxxxxxxxxxx>
> ---
> arch/loongarch/kernel/fpu.S | 94 +++++++++++++++++++++----------------
> 1 file changed, 54 insertions(+), 40 deletions(-)
>
> diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
> index d53ab10f4644..ecb127f9a673 100644
> --- a/arch/loongarch/kernel/fpu.S
> +++ b/arch/loongarch/kernel/fpu.S
> @@ -96,43 +96,57 @@
> .endm
>
> .macro sc_save_fcc base, tmp0, tmp1
> - movcf2gr \tmp0, $fcc0
> - move \tmp1, \tmp0
> - movcf2gr \tmp0, $fcc1
> - bstrins.d \tmp1, \tmp0, 15, 8
> - movcf2gr \tmp0, $fcc2
> - bstrins.d \tmp1, \tmp0, 23, 16
> - movcf2gr \tmp0, $fcc3
> - bstrins.d \tmp1, \tmp0, 31, 24
> - movcf2gr \tmp0, $fcc4
> - bstrins.d \tmp1, \tmp0, 39, 32
> - movcf2gr \tmp0, $fcc5
> - bstrins.d \tmp1, \tmp0, 47, 40
> - movcf2gr \tmp0, $fcc6
> - bstrins.d \tmp1, \tmp0, 55, 48
> - movcf2gr \tmp0, $fcc7
> - bstrins.d \tmp1, \tmp0, 63, 56
> - EX st.d \tmp1, \base, 0
> + movcf2fr ft0, $fcc0
> + movcf2fr ft1, $fcc1
> + movfr2gr.s \tmp0, ft0
> + movfr2gr.s \tmp1, ft1
> + EX st.b \tmp0, \base, 0
> + EX st.b \tmp1, \base, 8
> + movcf2fr ft0, $fcc2
> + movcf2fr ft1, $fcc3
> + movfr2gr.s \tmp0, ft0
> + movfr2gr.s \tmp1, ft1
> + EX st.b \tmp0, \base, 16
> + EX st.b \tmp1, \base, 24
> + movcf2fr ft0, $fcc4
> + movcf2fr ft1, $fcc5
> + movfr2gr.s \tmp0, ft0
> + movfr2gr.s \tmp1, ft1
> + EX st.b \tmp0, \base, 32
> + EX st.b \tmp1, \base, 40
> + movcf2fr ft0, $fcc6
> + movcf2fr ft1, $fcc7
> + movfr2gr.s \tmp0, ft0
> + movfr2gr.s \tmp1, ft1
> + EX st.b \tmp0, \base, 48
> + EX st.b \tmp1, \base, 56
> .endm
>
> .macro sc_restore_fcc base, tmp0, tmp1
> - EX ld.d \tmp0, \base, 0
> - bstrpick.d \tmp1, \tmp0, 7, 0
> - movgr2cf $fcc0, \tmp1
> - bstrpick.d \tmp1, \tmp0, 15, 8
> - movgr2cf $fcc1, \tmp1
> - bstrpick.d \tmp1, \tmp0, 23, 16
> - movgr2cf $fcc2, \tmp1
> - bstrpick.d \tmp1, \tmp0, 31, 24
> - movgr2cf $fcc3, \tmp1
> - bstrpick.d \tmp1, \tmp0, 39, 32
> - movgr2cf $fcc4, \tmp1
> - bstrpick.d \tmp1, \tmp0, 47, 40
> - movgr2cf $fcc5, \tmp1
> - bstrpick.d \tmp1, \tmp0, 55, 48
> - movgr2cf $fcc6, \tmp1
> - bstrpick.d \tmp1, \tmp0, 63, 56
> - movgr2cf $fcc7, \tmp1
> + EX ld.b \tmp0, \base, 0
> + EX ld.b \tmp1, \base, 8
> + movgr2fr.w ft0, \tmp0
> + movgr2fr.w ft1, \tmp1
> + movfr2cf $fcc0, ft0
> + movfr2cf $fcc1, ft1
> + EX ld.b \tmp0, \base, 16
> + EX ld.b \tmp1, \base, 24
> + movgr2fr.w ft0, \tmp0
> + movgr2fr.w ft1, \tmp1
> + movfr2cf $fcc2, ft0
> + movfr2cf $fcc3, ft1
> + EX ld.b \tmp0, \base, 32
> + EX ld.b \tmp1, \base, 40
> + movgr2fr.w ft0, \tmp0
> + movgr2fr.w ft1, \tmp1
> + movfr2cf $fcc4, ft0
> + movfr2cf $fcc5, ft1
> + EX ld.b \tmp0, \base, 48
> + EX ld.b \tmp1, \base, 56
> + movgr2fr.w ft0, \tmp0
> + movgr2fr.w ft1, \tmp1
> + movfr2cf $fcc6, ft0
> + movfr2cf $fcc7, ft1
> .endm
>
> .macro sc_save_fcsr base, tmp0
> @@ -449,9 +463,9 @@ SYM_FUNC_END(_init_fpu)
> * a2: fcsr
> */
> SYM_FUNC_START(_save_fp_context)
> - sc_save_fcc a1 t1 t2
> sc_save_fcsr a2 t1
> sc_save_fp a0
> + sc_save_fcc a1 t1 t2
> li.w a0, 0 # success
> jr ra
> SYM_FUNC_END(_save_fp_context)
> @@ -462,8 +476,8 @@ SYM_FUNC_END(_save_fp_context)
> * a2: fcsr
> */
> SYM_FUNC_START(_restore_fp_context)
> - sc_restore_fp a0
> sc_restore_fcc a1 t1 t2
> + sc_restore_fp a0
> sc_restore_fcsr a2 t1
> li.w a0, 0 # success
> jr ra
> @@ -475,9 +489,9 @@ SYM_FUNC_END(_restore_fp_context)
> * a2: fcsr
> */
> SYM_FUNC_START(_save_lsx_context)
> - sc_save_fcc a1, t0, t1
> sc_save_fcsr a2, t0
> sc_save_lsx a0
> + sc_save_fcc a1, t0, t1
> li.w a0, 0 # success
> jr ra
> SYM_FUNC_END(_save_lsx_context)
> @@ -488,8 +502,8 @@ SYM_FUNC_END(_save_lsx_context)
> * a2: fcsr
> */
> SYM_FUNC_START(_restore_lsx_context)
> - sc_restore_lsx a0
> sc_restore_fcc a1, t1, t2
> + sc_restore_lsx a0
> sc_restore_fcsr a2, t1
> li.w a0, 0 # success
> jr ra
> @@ -501,9 +515,9 @@ SYM_FUNC_END(_restore_lsx_context)
> * a2: fcsr
> */
> SYM_FUNC_START(_save_lasx_context)
> - sc_save_fcc a1, t0, t1
> sc_save_fcsr a2, t0
> sc_save_lasx a0
> + sc_save_fcc a1, t0, t1
> li.w a0, 0 # success
> jr ra
> SYM_FUNC_END(_save_lasx_context)
> @@ -514,8 +528,8 @@ SYM_FUNC_END(_save_lasx_context)
> * a2: fcsr
> */
> SYM_FUNC_START(_restore_lasx_context)
> - sc_restore_lasx a0
> sc_restore_fcc a1, t1, t2
> + sc_restore_lasx a0
> sc_restore_fcsr a2, t1
> li.w a0, 0 # success
> jr ra
> --
> 2.43.0
>