Re: [PATCH] radeon: Deinline indirect register accessor functions

From: Christian KÃnig
Date: Mon May 18 2015 - 15:39:35 EST


I'm actually surprised how often people come along with that. The last time we tried this it caused a noticeable performance drop.

Basic problem is that this line:
+ if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
optimizes away in most of the cases which reduces the call to a readl which is way faster than the spinlock path.

So this is a NAK,
Christian.

On 18.05.2015 19:59, Denys Vlasenko wrote:
This patch deinlines indirect register accessor functions.

These functions perform two mmio accesses, framed by spinlock/unlock.
Spin lock/unlock by itself takes more than 50 cycles in ideal case.

With this .config: http://busybox.net/~vda/kernel_config,
after uninlining these functions have sizes and callsite counts
as follows:

r600_uvd_ctx_rreg: 111 bytes, 4 callsites
r600_uvd_ctx_wreg: 113 bytes, 5 callsites
eg_pif_phy0_rreg: 106 bytes, 13 callsites
eg_pif_phy0_wreg: 108 bytes, 13 callsites
eg_pif_phy1_rreg: 107 bytes, 13 callsites
eg_pif_phy1_wreg: 108 bytes, 13 callsites
rv370_pcie_rreg: 111 bytes, 21 callsites
rv370_pcie_wreg: 113 bytes, 24 callsites
r600_rcu_rreg: 111 bytes, 16 callsites
r600_rcu_wreg: 113 bytes, 25 callsites
cik_didt_rreg: 106 bytes, 10 callsites
cik_didt_wreg: 107 bytes, 10 callsites
r100_mm_rreg: 112 bytes, 2083 callsites
r100_mm_wreg: 116 bytes, 3570 callsites
tn_smc_rreg: 106 bytes, 126 callsites
tn_smc_wreg: 107 bytes, 116 callsites
eg_cg_rreg: 107 bytes, 20 callsites
eg_cg_wreg: 108 bytes, 52 callsites

Reduction in code size is more than 80,000 bytes:

text data bss dec hex filename
85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
85657104 22294872 20627456 128579432 7a9f768 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
Cc: Christian KÃnig <christian.koenig@xxxxxxx>
Cc: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
drivers/gpu/drm/radeon/r100.c | 34 +++++
drivers/gpu/drm/radeon/radeon.h | 229 +++------------------------------
drivers/gpu/drm/radeon/radeon_device.c | 179 ++++++++++++++++++++++++++
3 files changed, 233 insertions(+), 209 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 04f2514..95868c7 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -4090,6 +4090,40 @@ int r100_init(struct radeon_device *rdev)
return 0;
}
+uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
+ bool always_indirect)
+{
+ /* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
+ if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
+ return readl(((void __iomem *)rdev->rmmio) + reg);
+ else {
+ unsigned long flags;
+ uint32_t ret;
+
+ spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+ writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+ ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+ spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+
+ return ret;
+ }
+}
+
+void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
+ bool always_indirect)
+{
+ if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
+ writel(v, ((void __iomem *)rdev->rmmio) + reg);
+ else {
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+ writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+ writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+ spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+ }
+}
+
u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
{
if (reg < rdev->rio_mem_size)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 5587603..bb6b25c 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2465,39 +2465,10 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);
#define RADEON_MIN_MMIO_SIZE 0x10000
-static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
- bool always_indirect)
-{
- /* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
- if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
- return readl(((void __iomem *)rdev->rmmio) + reg);
- else {
- unsigned long flags;
- uint32_t ret;
-
- spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
- writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
- ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
- spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
-
- return ret;
- }
-}
-
-static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
- bool always_indirect)
-{
- if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
- writel(v, ((void __iomem *)rdev->rmmio) + reg);
- else {
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
- writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
- writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
- spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
- }
-}
+uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
+ bool always_indirect);
+void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
+ bool always_indirect);
u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
void r100_io_wreg(struct radeon_device *rdev, u32 reg, u32 v);
@@ -2582,182 +2553,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
/*
* Indirect registers accessor
*/
-static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
-{
- unsigned long flags;
- uint32_t r;
-
- spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
- WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
- r = RREG32(RADEON_PCIE_DATA);
- spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
- return r;
-}
-
-static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
- WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
- WREG32(RADEON_PCIE_DATA, (v));
- spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
-}
-
-static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->smc_idx_lock, flags);
- WREG32(TN_SMC_IND_INDEX_0, (reg));
- r = RREG32(TN_SMC_IND_DATA_0);
- spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
- return r;
-}
-
-static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->smc_idx_lock, flags);
- WREG32(TN_SMC_IND_INDEX_0, (reg));
- WREG32(TN_SMC_IND_DATA_0, (v));
- spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
-}
-
-static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
- WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
- r = RREG32(R600_RCU_DATA);
- spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
- return r;
-}
-
-static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
- WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
- WREG32(R600_RCU_DATA, (v));
- spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
-}
-
-static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->cg_idx_lock, flags);
- WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
- r = RREG32(EVERGREEN_CG_IND_DATA);
- spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
- return r;
-}
-
-static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->cg_idx_lock, flags);
- WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
- WREG32(EVERGREEN_CG_IND_DATA, (v));
- spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
- r = RREG32(EVERGREEN_PIF_PHY0_DATA);
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
- return r;
-}
-
-static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
- WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
- r = RREG32(EVERGREEN_PIF_PHY1_DATA);
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
- return r;
-}
-
-static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
- WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
- WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
- r = RREG32(R600_UVD_CTX_DATA);
- spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
- return r;
-}
-
-static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
- WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
- WREG32(R600_UVD_CTX_DATA, (v));
- spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
-}
-
-
-static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->didt_idx_lock, flags);
- WREG32(CIK_DIDT_IND_INDEX, (reg));
- r = RREG32(CIK_DIDT_IND_DATA);
- spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
- return r;
-}
-
-static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->didt_idx_lock, flags);
- WREG32(CIK_DIDT_IND_INDEX, (reg));
- WREG32(CIK_DIDT_IND_DATA, (v));
- spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
-}
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);
void r100_pll_errata_after_index(struct radeon_device *rdev);
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index bd7519f..6712505 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
rdev->flags &= ~RADEON_IS_PX;
}
+/*
+ * Indirect registers accessor
+ */
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
+{
+ unsigned long flags;
+ uint32_t r;
+
+ spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+ WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+ r = RREG32(RADEON_PCIE_DATA);
+ spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+ return r;
+}
+
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+ WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+ WREG32(RADEON_PCIE_DATA, (v));
+ spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+}
+
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+ WREG32(TN_SMC_IND_INDEX_0, (reg));
+ r = RREG32(TN_SMC_IND_DATA_0);
+ spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+ return r;
+}
+
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+ WREG32(TN_SMC_IND_INDEX_0, (reg));
+ WREG32(TN_SMC_IND_DATA_0, (v));
+ spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+}
+
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+ WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+ r = RREG32(R600_RCU_DATA);
+ spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+ return r;
+}
+
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+ WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+ WREG32(R600_RCU_DATA, (v));
+ spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+}
+
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+ WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+ r = RREG32(EVERGREEN_CG_IND_DATA);
+ spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+ return r;
+}
+
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+ WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+ WREG32(EVERGREEN_CG_IND_DATA, (v));
+ spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+}
+
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+ r = RREG32(EVERGREEN_PIF_PHY0_DATA);
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+ return r;
+}
+
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+ WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+ r = RREG32(EVERGREEN_PIF_PHY1_DATA);
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+ return r;
+}
+
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+ WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+ WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+ r = RREG32(R600_UVD_CTX_DATA);
+ spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+ return r;
+}
+
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+ WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+ WREG32(R600_UVD_CTX_DATA, (v));
+ spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+}
+
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+ WREG32(CIK_DIDT_IND_INDEX, (reg));
+ r = RREG32(CIK_DIDT_IND_DATA);
+ spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+ return r;
+}
+
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+ WREG32(CIK_DIDT_IND_INDEX, (reg));
+ WREG32(CIK_DIDT_IND_DATA, (v));
+ spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+}
+
/**
* radeon_program_register_sequence - program an array of registers.
*

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/