[PATCH 41/41] afs: Fix offline and busy handling

From: David Howells
Date: Thu Nov 09 2023 - 10:44:17 EST


The current code assumes offline and busy volume states apply to all
instances of a volume, not just the one on the server that returned
VOFFLINE or VBUSY.

Fix that by moving the flags that record these states into the
afs_server_entry struct, which represents a particular instance of a volume
on a specific server.

Further, add a sleep before retrying once we have iterated through all the
servers, so that we don't keep poking them every few milliseconds.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Marc Dionne <marc.dionne@xxxxxxxxxxxx>
cc: linux-afs@xxxxxxxxxxxxxxxxxxx
---
fs/afs/internal.h | 7 ++++---
fs/afs/rotate.c | 40 ++++++++++++++++++++++++--------------
include/trace/events/afs.h | 1 +
3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index d1031241d11b..eb59b0487f8b 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -603,6 +603,9 @@ struct afs_server_entry {
struct afs_volume *volume;
struct list_head slink; /* Link in server->volumes */
time64_t cb_expires_at; /* Time at which volume-level callback expires */
+ unsigned long flags;
+#define AFS_SE_VOLUME_OFFLINE 0 /* Set if volume offline notice given */
+#define AFS_SE_VOLUME_BUSY 1 /* Set if volume busy notice given */
};

struct afs_server_list {
@@ -636,9 +639,7 @@ struct afs_volume {
#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */
#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */
#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */
-#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */
-#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
-#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
+#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_volume *cache; /* Caching cookie */
#endif
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 3f656dcb0adf..cb0ab1c2c401 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -111,7 +111,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
/*
* Post volume busy note.
*/
-static void afs_busy(struct afs_volume *volume, u32 abort_code)
+static void afs_busy(struct afs_operation *op, u32 abort_code)
{
const char *m;

@@ -122,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
default: m = "busy"; break;
}

- pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
+ pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n",
+ op->volume->vid, op->volume->name, &op->server->uuid, m);
}

/*
@@ -130,6 +131,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
*/
static bool afs_sleep_and_retry(struct afs_operation *op)
{
+ trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0);
if (!(op->flags & AFS_OPERATION_UNINTR)) {
msleep_interruptible(1000);
if (signal_pending(current)) {
@@ -180,6 +182,10 @@ bool afs_select_fileserver(struct afs_operation *op)
/* Evaluate the result of the previous operation, if there was one. */
switch (op->call_error) {
case 0:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
op->cumul_error.responded = true;
fallthrough;
default:
@@ -297,18 +303,16 @@ bool afs_select_fileserver(struct afs_operation *op)
* expected to come back but it might take a long time (could be
* days).
*/
- if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
- afs_busy(op->volume, abort_code);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
}
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
afs_op_set_error(op, -EADV);
goto failed;
}
- if (op->flags & AFS_OPERATION_CUR_ONLY) {
- afs_op_set_error(op, -ESTALE);
- goto failed;
- }
goto busy;

case VRESTARTING: /* The fileserver is either shutting down or starting up. */
@@ -329,9 +333,11 @@ bool afs_select_fileserver(struct afs_operation *op)
afs_op_set_error(op, -EBUSY);
goto failed;
}
- if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
- afs_busy(op->volume, abort_code);
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ if (!test_and_set_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
}
busy:
if (op->flags & AFS_OPERATION_CUR_ONLY) {
@@ -411,8 +417,10 @@ bool afs_select_fileserver(struct afs_operation *op)
default:
afs_op_accumulate_error(op, error, abort_code);
failed_but_online:
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
goto failed;
}

@@ -640,8 +648,10 @@ bool afs_select_fileserver(struct afs_operation *op)
* of them were busy.
*/
trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0);
- if (op->flags & AFS_OPERATION_VBUSY)
+ if (op->flags & AFS_OPERATION_VBUSY) {
+ afs_sleep_and_retry(op);
goto restart_from_beginning;
+ }

rcu_read_lock();
for (i = 0; i < op->server_list->nr_servers; i++) {
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index 804d9e147314..ac50fa687429 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -446,6 +446,7 @@ enum yfs_cm_operation {

#define afs_rotate_traces \
EM(afs_rotate_trace_aborted, "Abortd") \
+ EM(afs_rotate_trace_busy_sleep, "BsySlp") \
EM(afs_rotate_trace_check_vol_status, "VolStt") \
EM(afs_rotate_trace_failed, "Failed") \
EM(afs_rotate_trace_iter, "Iter ") \