[RFC 6/9] apparmor: Initial prototype for optimizing ref switch

From: Neeraj Upadhyay
Date: Wed Jan 10 2024 - 06:25:18 EST


This patch adds a prototype for optimizing the atomic
window during label scan, by switching to an immortal
percpu ref.

Below is the sequence of operations to do this:

1. Ensure that both immortal ref and label ref are in percpu mode.
Reinit the immortal ref in percpu mode.

Swap percpu and atomic counters of label refcount and immortal ref
percpu-ref
                      +-------------------+
  +-------+           | percpu-ctr-addr1  |
  | label | --------->|-------------------|    +----------------+
  +-------+           | data              |--->| Atomic counter1|
                      +-------------------+    +----------------+
  +-------+           +-------------------+
  |ImmLbl |---------->| percpu-ctr-addr2  |    +----------------+
  +-------+           |-------------------|--->| Atomic counter2|
                      | data              |    +----------------+
                      +-------------------+

label ->percpu-ctr-addr = percpu-ctr-addr2
ImmLbl ->percpu-ctr-addr = percpu-ctr-addr1
label ->data->count = Atomic counter2
ImmLbl ->data->count = Atomic counter1

2. Check the counters collected in the immortal label, by switching it
to atomic mode.

3. If the count is 0, do,
a. Switch immortal counter to percpu again, giving it an
initial count of 1.
b. Swap the label and immortal counters again. The immortal
ref now has the counter values from new percpu ref get
and put operations on the label ref, from the point
when we did the initial swap operation.
c. Transfer the percpu counts in immortal ref to atomic
counter of label percpu refcount.
d. Kill immortal ref, for reinit on next iteration.
e. Switch label percpu ref to atomic mode.
f. If the counter is 1, drop the initial ref.

4. If the count is not 0, terminate the operations and re-swap
the counters.
a. Switch immortal counter to percpu again, giving it an
initial count of 1.
b. Swap the label and immortal counters again. The immortal
ref now has the counter values from new percpu ref get
and put operations on the label ref, from the point
when we did the initial swap operation.
c. Transfer the percpu counts in immortal ref to atomic
counter of label percpu refcount.
d. Kill immortal ref, for reinit on next iteration.

Using this approach, we ensure that label ref users do not switch
to atomic mode while there are active references on the label.
However, this approach requires multiple percpu ref mode switches,
and adds high overhead and complexity to the scanning code.

Signed-off-by: Neeraj Upadhyay <Neeraj.Upadhyay@xxxxxxx>
---
include/linux/percpu-refcount.h | 2 +
lib/percpu-refcount.c | 93 +++++++++++++++++++++++++++++
security/apparmor/lsm.c | 101 ++++++++++++++++++++++++++++----
3 files changed, 185 insertions(+), 11 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index d73a1c08c3e3..9e30c458cc00 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -131,6 +131,8 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);
+void percpu_ref_swap_percpu_sync(struct percpu_ref *ref1, struct percpu_ref *ref2);
+void percpu_ref_transfer_percpu_count(struct percpu_ref *ref1, struct percpu_ref *ref2);

/**
* percpu_ref_kill - drop the initial ref
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 668f6aa6a75d..36814446db34 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -477,3 +477,96 @@ void percpu_ref_resurrect(struct percpu_ref *ref)
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
+
+/*
+ * RCU callback queued by __percpu_ref_swap_percpu(): run the pending
+ * confirm_switch callback for the owning ref, clear it, and wake up
+ * everyone sleeping on percpu_ref_switch_waitq.
+ */
+static void percpu_ref_swap_percpu_rcu(struct rcu_head *rcu)
+{
+	struct percpu_ref_data *ref_data;
+
+	ref_data = container_of(rcu, struct percpu_ref_data, rcu);
+
+	ref_data->confirm_switch(ref_data->ref);
+	ref_data->confirm_switch = NULL;
+	wake_up_all(&percpu_ref_switch_waitq);
+}
+
+/*
+ * Record @confirm_switch (or the no-op confirm callback when it is NULL) on
+ * @ref and queue percpu_ref_swap_percpu_rcu() to invoke it.
+ */
+static void __percpu_ref_swap_percpu(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch)
+{
+	struct percpu_ref_data *ref_data = ref->data;
+
+	if (!confirm_switch)
+		confirm_switch = percpu_ref_noop_confirm_switch;
+
+	ref_data->confirm_switch = confirm_switch;
+	call_rcu_hurry(&ref_data->rcu, percpu_ref_swap_percpu_rcu);
+}
+
+/**
+ * percpu_ref_swap_percpu_sync - Swap percpu counter of one ref with other
+ * @ref1: First percpu_ref to swap the counter
+ * @ref2: Second percpu_ref for counter swap
+ *
+ * Exchanges the percpu counter pointers and the atomic counter values of
+ * @ref1 and @ref2, then waits until the confirm_switch callbacks queued for
+ * both refs have been cleared. Nothing is swapped unless both refs are in
+ * percpu mode.
+ */
+void percpu_ref_swap_percpu_sync(struct percpu_ref *ref1, struct percpu_ref *ref2)
+{
+	struct percpu_ref_data *data1 = ref1->data;
+	struct percpu_ref_data *data2 = ref2->data;
+	unsigned long __percpu *percpu_count;
+	unsigned long percpu_cnt_ptr1, percpu_cnt_ptr2;
+	unsigned long flags;
+	long count1, count2;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+	wait_event_lock_irq(percpu_ref_switch_waitq,
+			    !data1->confirm_switch && !data2->confirm_switch,
+			    percpu_ref_switch_lock);
+	if (!__ref_is_percpu(ref1, &percpu_count) ||
+	    !__ref_is_percpu(ref2, &percpu_count)) {
+		spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+		return;
+	}
+
+	/*
+	 * Take the snapshots only here, with percpu_ref_switch_lock held and
+	 * no switch pending on either ref. Reading them before the lock (as
+	 * the declarations used to) could write back values that were already
+	 * stale by the time the swap happens.
+	 */
+	percpu_cnt_ptr1 = READ_ONCE(ref1->percpu_count_ptr);
+	percpu_cnt_ptr2 = READ_ONCE(ref2->percpu_count_ptr);
+	count1 = atomic_long_read(&data1->count);
+	count2 = atomic_long_read(&data2->count);
+
+	WRITE_ONCE(ref1->percpu_count_ptr, percpu_cnt_ptr2);
+	WRITE_ONCE(ref2->percpu_count_ptr, percpu_cnt_ptr1);
+
+	__percpu_ref_swap_percpu(ref1, NULL);
+	__percpu_ref_swap_percpu(ref2, NULL);
+	atomic_long_set(&data1->count, count2);
+	atomic_long_set(&data2->count, count1);
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+
+	/* Sleep until both queued RCU callbacks have cleared confirm_switch. */
+	wait_event(percpu_ref_switch_waitq,
+		   !data1->confirm_switch && !data2->confirm_switch);
+}
+
+/**
+ * percpu_ref_transfer_percpu_count - Transfer percpu counts of one ref to other
+ * @ref1: percpu_ref to transfer the counters to
+ * @ref2: percpu_ref to transfer the counters from
+ *
+ * The per cpu counts of ref2 are summed, reset to zero and added to the
+ * atomic counter of ref1. Nothing is transferred unless both refs are in
+ * percpu mode once any pending switch has completed.
+ */
+void percpu_ref_transfer_percpu_count(struct percpu_ref *ref1, struct percpu_ref *ref2)
+{
+	struct percpu_ref_data *dst_data = ref1->data;
+	struct percpu_ref_data *src_data = ref2->data;
+	unsigned long __percpu *dst_counts;
+	unsigned long __percpu *src_counts;
+	unsigned long total = 0;
+	unsigned long flags;
+	int cpu;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+	wait_event_lock_irq(percpu_ref_switch_waitq,
+			    !dst_data->confirm_switch && !src_data->confirm_switch,
+			    percpu_ref_switch_lock);
+
+	/* dst_counts is only a scratch output for the mode check on ref1. */
+	if (__ref_is_percpu(ref1, &dst_counts) &&
+	    __ref_is_percpu(ref2, &src_counts)) {
+		for_each_possible_cpu(cpu) {
+			unsigned long *slot = per_cpu_ptr(src_counts, cpu);
+
+			total += *slot;
+			*slot = 0;
+		}
+		atomic_long_add((long)total, &dst_data->count);
+	}
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index cf8429f5c88e..d0d4ebad1e26 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -92,6 +92,7 @@ static LLIST_HEAD(aa_label_reclaim_head);
static struct llist_node *last_reclaim_label;
static struct aa_label_reclaim_node aa_label_reclaim_nodes[AA_LABEL_RECLAIM_NODE_MAX];
static DECLARE_DELAYED_WORK(aa_label_reclaim_work, aa_label_reclaim_work_fn);
+static struct percpu_ref aa_label_reclaim_ref;

void aa_label_reclaim_add_label(struct aa_label *label)
{
@@ -135,14 +136,18 @@ static void aa_put_all_reclaim_nodes(void)
for (i = 0; i < AA_LABEL_RECLAIM_NODE_MAX; i++)
aa_label_reclaim_nodes[i].inuse = false;
}
+
+/* Release callback registered for aa_label_reclaim_ref; deliberately empty. */
+static void aa_release_reclaim_ref_noop(struct percpu_ref *ref)
+{
+}

static void aa_label_reclaim_work_fn(struct work_struct *work)
{
struct llist_node *pos, *first, *head, *prev, *next;
+ static bool reclaim_ref_dead_once;
struct llist_node *reclaim_node;
struct aa_label *label;
int cnt = 0;
- bool held;
+ bool held, ref_is_zero;

first = aa_label_reclaim_head.first;
if (!first)
@@ -178,16 +183,72 @@ static void aa_label_reclaim_work_fn(struct work_struct *work)
}

label = container_of(pos, struct aa_label, reclaim_node);
- percpu_ref_switch_to_atomic_sync(&label->count);
- rcu_read_lock();
- percpu_ref_put(&label->count);
- held = percpu_ref_tryget(&label->count);
- if (!held)
- prev->next = pos->next;
- rcu_read_unlock();
- if (!held)
- continue;
- percpu_ref_switch_to_percpu(&label->count);
+ if (reclaim_ref_dead_once)
+ percpu_ref_reinit(&aa_label_reclaim_ref);
+
+ /*
+ * Switch counters of label ref and reclaim ref.
+ * Label's refcount becomes 1
+ * Percpu refcount has the current refcount value
+ * of the label percpu_ref.
+ */
+ percpu_ref_swap_percpu_sync(&label->count, &aa_label_reclaim_ref);
+
+ /* Switch reclaim ref to atomic, to check for 0 */
+ percpu_ref_switch_to_atomic_sync(&aa_label_reclaim_ref);
+
+ /*
+ * Release a count (original label percpu ref had an extra count,
+ * from the llist addition).
+ * When all percpu references have been released, this should
+ * be the initial count, which gets dropped.
+ */
+ percpu_ref_put(&aa_label_reclaim_ref);
+ /*
+ * Release function of reclaim ref is noop; we store the result
+ * for later processing after common code. Assign unconditionally:
+ * ref_is_zero has no initializer and must not carry a stale
+ * "true" over from a previously scanned label.
+ */
+ ref_is_zero = percpu_ref_is_zero(&aa_label_reclaim_ref);
+
+ /*
+ * Restore back initial count. Switch reclaim ref to
+ * percpu, for switching back the label percpu and
+ * atomic counters.
+ */
+ percpu_ref_get(&aa_label_reclaim_ref);
+ percpu_ref_switch_to_percpu(&aa_label_reclaim_ref);
+ /*
+ * Swap the refs again. Label gets all old counts
+ * in its atomic counter after this operation.
+ */
+ percpu_ref_swap_percpu_sync(&label->count, &aa_label_reclaim_ref);
+
+ /*
+ * Transfer the percpu counts, which got added, while this
+ * switch was going on. The counters are accumulated into
+ * the label ref's atomic counter.
+ */
+ percpu_ref_transfer_percpu_count(&label->count, &aa_label_reclaim_ref);
+
+ /* Kill reclaim ref for reinitialization, for next iteration */
+ percpu_ref_kill(&aa_label_reclaim_ref);
+ reclaim_ref_dead_once = true;
+
+ /* If refcount of label ref was found to be 0, reclaim it now! */
+ if (ref_is_zero) {
+ percpu_ref_switch_to_atomic_sync(&label->count);
+ rcu_read_lock();
+ percpu_ref_put(&label->count);
+ held = percpu_ref_tryget(&label->count);
+ if (!held)
+ prev->next = pos->next;
+ rcu_read_unlock();
+ if (!held)
+ continue;
+ percpu_ref_switch_to_percpu(&label->count);
+ }
+
+
cnt++;
if (cnt == AA_MAX_LABEL_RECLAIMS) {
last_reclaim_label = pos;
@@ -2136,6 +2197,16 @@ static int __init set_init_ctx(void)
return 0;
}

+/*
+ * Clear the label stored on the current task's real_cred and drop one
+ * reference on the root namespace's unconfined label. Always returns 0.
+ */
+static int __init clear_init_ctx(void)
+{
+	struct cred *init_cred;
+
+	init_cred = (__force struct cred *)current->real_cred;
+	set_cred_label(init_cred, NULL);
+
+	aa_put_label(ns_unconfined(root_ns));
+
+	return 0;
+}
+
static void destroy_buffers(void)
{
union aa_buffer *aa_buf;
@@ -2422,6 +2493,14 @@ static int __init apparmor_init(void)
queue_delayed_work(aa_label_reclaim_wq, &aa_label_reclaim_work,
AA_LABEL_RECLAIM_INTERVAL_MS);

+ /*
+ * percpu_ref_init() returns 0 on success and a negative errno on
+ * failure, so a non-zero result is the error case.
+ * NOTE(review): confirm the value returned via buffers_out reflects
+ * this failure.
+ */
+ if (percpu_ref_init(&aa_label_reclaim_ref, aa_release_reclaim_ref_noop,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ AA_ERROR("Failed to allocate label reclaim percpu ref\n");
+ /*
+ * clear_init_ctx() dereferences root_ns, so run it before
+ * aa_free_root_ns() (which presumably tears root_ns down - confirm).
+ */
+ clear_init_ctx();
+ aa_free_root_ns();
+ goto buffers_out;
+ }
+
security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
&apparmor_lsmid);

--
2.34.1