[PATCH 3/3] ipc/shm: locking / questions

From: Manfred Spraul
Date: Thu Jul 22 2021 - 14:19:47 EST


for discussion, not for merging:
- make locking a bit more explicit
- there is no advantage to keeping a temporary list in exit_shm(),
thus delete segments one by one.
- lots of debug pr_info.

Signed-off-by: Manfred Spraul <manfred@xxxxxxxxxxxxxxxx>
---
include/linux/ipc_namespace.h | 16 ++++
ipc/namespace.c | 4 +
ipc/shm.c | 160 +++++++++++++++++++++-------------
3 files changed, 121 insertions(+), 59 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 05e22770af51..f3a09604d0d5 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -126,11 +126,22 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags,

static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
+pr_info("get_ipc_ns: inc for ns %px.\n", ns);
if (ns)
refcount_inc(&ns->ns.count);
return ns;
}

+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+pr_info("get_ipc_ns_not_zero: inc for ns %px.\n", ns);
+ if (ns) {
+ if (refcount_inc_not_zero(&ns->ns.count))
+ return ns;
+ }
+ return NULL;
+}
+
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -147,6 +158,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
return ns;
}

+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
+{
+ return ns;
+}
+
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
}
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 7bd0766ddc3b..4160ea18dcd2 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -117,6 +117,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,

static void free_ipc_ns(struct ipc_namespace *ns)
{
+pr_info("free_ipc_ns: worker task for namespace %px.\n", ns);
/* mq_put_mnt() waits for a grace period as kern_unmount()
* uses synchronize_rcu().
*/
@@ -129,6 +130,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
+pr_info("free_ipc_ns: worker task for namespace %px done.\n", ns);
}

static LLIST_HEAD(free_ipc_list);
@@ -164,9 +166,11 @@ static DECLARE_WORK(free_ipc_work, free_ipc);
*/
void put_ipc_ns(struct ipc_namespace *ns)
{
+pr_info("put_ipc_ns: got called for %px.\n", ns);
if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
+pr_info("put_ipc_ns: destroying namespace %px.\n", ns);

if (llist_add(&ns->mnt_llist, &free_ipc_list))
schedule_work(&free_ipc_work);
diff --git a/ipc/shm.c b/ipc/shm.c
index a746886a0e00..f55118d0a425 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -62,9 +62,14 @@ struct shmid_kernel /* private to the kernel */
struct pid *shm_lprid;
struct ucounts *mlock_ucounts;

- /* The task created the shm object. NULL if the task is dead. */
+ /* The task that created the shm object; used for looking up
+ * task->sysvshm.shm_clist_lock */
struct task_struct *shm_creator;
- struct list_head shm_clist; /* list by creator */
+
+ /* list by creator. shm_clist_lock required for read/write
+ * if list_empty(), then the creator is dead already
+ */
+ struct list_head shm_clist;
struct ipc_namespace *ns;
} __randomize_layout;

@@ -130,6 +135,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
#ifdef CONFIG_IPC_NS
void shm_exit_ns(struct ipc_namespace *ns)
{
+pr_info("shm_exit_ns(), %px.\n", ns);
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
@@ -237,15 +243,30 @@ static inline void task_shm_clist_unlock(struct task_struct *task)
spin_unlock(&task->sysvshm.shm_clist_lock);
}

-void shm_clist_rm(struct shmid_kernel *shp)
+/* Must be called with shp locked,
+ * and before ipc_rmid(). */
+static inline void shm_clist_rm(struct shmid_kernel *shp)
{
- if (!shp->shm_creator)
+ struct task_struct *creator;
+
+ creator = READ_ONCE(shp->shm_creator);
+ if (!creator) {
+pr_info("shm_clist_rm: creator already NULL for %px.\n", shp);
return;
+ }
+
+ task_shm_clist_lock(creator);
+
+pr_info("shm_clist_rm: creator %px locked for shp %px.\n", creator, shp);

- task_shm_clist_lock(shp->shm_creator);
- list_del_init(&shp->shm_clist);
- task_shm_clist_unlock(shp->shm_creator);
- shp->shm_creator = NULL;
+ /* A concurrent exit_shm() may do a list_del_init() as well.
+ * Do nothing if exit_shm() has already done the work.
+ */
+ if (!list_empty(&shp->shm_clist)) {
+ list_del_init(&shp->shm_clist);
+ WRITE_ONCE(shp->shm_creator, NULL);
+ }
+ task_shm_clist_unlock(creator);
}

static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
@@ -305,6 +326,8 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
struct file *shm_file;

+pr_info("shm_destroy: current %px, namespace %px, shp %px.\n", current, ns, shp);
+
shm_file = shp->shm_file;
shp->shm_file = NULL;
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -333,6 +356,11 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
*/
static bool shm_may_destroy(struct shmid_kernel *shp)
{
+pr_info("shm_may_destroy for current %px, ns %px, shp %px.\n", current, shp->ns, shp);
+
+/* TODO:
+ * understand refcounting for shp->ns: What guarantees that ns cannot go out of scope?
+ */
return (shp->shm_nattch == 0) &&
(shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
@@ -365,6 +393,7 @@ static void shm_close(struct vm_area_struct *vma)
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
+pr_info("shm_close for current %px, ns %px, shp %px.\n", current, shp->ns, shp);
if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
@@ -413,67 +442,66 @@ void shm_init_task(struct task_struct *task)
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- LIST_HEAD(tmp);
- struct shmid_kernel *shp, *n;

- if (list_empty(&task->sysvshm.shm_clist))
- return;
+pr_info("exit_shm: called for task %px, current %px\n", task, current);

- rcu_read_lock(); /* for refcount_inc_not_zero */
- task_shm_clist_lock(task);
+ for (;;) {
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns;

- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
- struct ipc_namespace *ns = shp->ns;

- /*
- * Remove shm from task list and nullify shm_creator which
- * marks object as orphaned.
- *
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- list_del_init(&shp->shm_clist);
- shp->shm_creator = NULL;
+ task_shm_clist_lock(task);
+ if (list_empty(&task->sysvshm.shm_clist)) {
+ task_shm_clist_unlock(task);
+ break;
+ }

- printk("exit_shm() %px refcnt=%u, id=%d,key=%x\n", shp,
- refcount_read(&shp->shm_perm.refcount),
- shp->shm_perm.id, shp->shm_perm.key
- );
+ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
+ shm_clist);
+
+ /* 1) unlink */
+ list_del_init(&shp->shm_clist);
+ WRITE_ONCE(shp->shm_creator, NULL);

/*
- * Will destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
+ * 2) get a reference to the namespace.
+ * The refcount could already be 0. If it is 0, then
+ * the shm objects will be freed by free_ipc_work().
*/
- if (shp->ns->shm_rmid_forced && shm_may_destroy(shp)) {
+ ns = shp->ns;
+pr_info("exit_shm: called for task %px, current %px, processing shp %px, ns %px\n", task, current, shp, ns);
+
+ ns = get_ipc_ns_not_zero(ns);
+ if (ns) {
/*
- * We may race with shm_exit_ns() if refcounter
- * already zero. Let's skip shm_destroy() of such
- * shm object as it will be destroyed during shm_exit_ns()
+ * 3) get a reference to the shp itself.
+ * This cannot fail: shm_clist_rm() is called before
+ * ipc_rmid(), thus the refcount cannot be 0.
*/
- if (!refcount_inc_not_zero(&ns->ns.count)) /* get_ipc_ns */
- continue;
-
- list_add(&shp->shm_clist, &tmp);
+ ipc_rcu_getref(&shp->shm_perm);
+ }
+ task_shm_clist_unlock(task);
+
+ if (ns) {
+ down_write(&shm_ids(ns).rwsem);
+ shm_lock_by_ptr(shp);
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+
+ if (ipc_valid_object(&shp->shm_perm)) {
+ if (shm_may_destroy(shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ } else {
+ /*
+ * Someone else deleted the shp while we have waited.
+ * Just unlock and continue.
+ */
+ shm_unlock(shp);
+ }
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns);
}
- }
-
- list_del(&task->sysvshm.shm_clist);
- task_shm_clist_unlock(task);
- rcu_read_unlock();
-
- list_for_each_entry_safe(shp, n, &tmp, shm_clist) {
- struct ipc_namespace *ns = shp->ns;
-
- list_del_init(&shp->shm_clist);
-
- down_write(&shm_ids(ns).rwsem);
- shm_lock_by_ptr(shp);
- /* will do put_ipc_ns(shp->ns) */
- shm_destroy(ns, shp);
- up_write(&shm_ids(ns).rwsem);
- put_ipc_ns(ns); /* see refcount_inc_not_zero */
}
}

@@ -566,6 +594,8 @@ static int shm_release(struct inode *ino, struct file *file)
{
struct shm_file_data *sfd = shm_file_data(file);

+pr_info("shm_release for current %px.\n", current);
+
put_ipc_ns(sfd->ns);
fput(sfd->file);
shm_file_data(file) = NULL;
@@ -731,9 +761,21 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (error < 0)
goto no_id;

- list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
shp->ns = ns;

+ task_shm_clist_lock(current);
+ list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
+
+pr_info("newseg: current %px, namespace %px, shp %px.\n", current, ns, shp);
+ {
+ struct shmid_kernel *shp;
+
+ list_for_each_entry(shp, &current->sysvshm.shm_clist, shm_clist) {
+pr_info("newseg: current %px, list entry %px.\n", current, shp);
+ }
+ }
+ task_shm_clist_unlock(current);
+
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
* proc-ps tools use this. Changing this will break them.
--
2.31.1


--------------E806D47D8E247ABC3FEA7CB2--