From db7caebe2863f0405cb57d717384ca309ff322ec Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Mon, 30 Jan 2012 13:49:40 -0500 Subject: [PATCH] pnfs: handle partial layout recalls layout status flags for PNFS_LAYOUT_RECALLED and PNFS_LAYOUT_CHANGED are removed, and replaced by a list of recalled ranges. recalls during io are added to the list and processed on pnfs_layout_io_finished(). recalls outside of io are processed immediately. new function layout_recall_range() loops through all existing layout segments, and removes ranges that intersect with the range recalled. it deals with 4 cases per segment: only the beginning of the segment is recalled; only the end of the segment is recalled; the entire segment is recalled; or only a middle part of the segment is recalled. new function pnfs_layout_recall_status() is called before each unit of io, allowing io threads to bail out early if a recall is detected. it takes a layout segment as an argument, and only returns an error if that segment intersects a recalled range. new function pnfs_layout_recall_fenced() is called when map_ds_error() in pnfs_io.c detects fencing. 
also takes a layout segment as an argument, and appends a recall matching the range of the segment pnfs_layout_state_prepare() now checks the given range against the list of recalled ranges Signed-off-by: Casey Bodley --- daemon/pnfs.h | 32 ++--- daemon/pnfs_io.c | 25 ++-- daemon/pnfs_layout.c | 291 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 277 insertions(+), 71 deletions(-) diff --git a/daemon/pnfs.h b/daemon/pnfs.h index 329f9e7..a63fac0 100644 --- a/daemon/pnfs.h +++ b/daemon/pnfs.h @@ -84,13 +84,6 @@ enum pnfs_iomode { }; enum pnfs_layout_status { - /* CB_LAYOUTRECALL indicated that the server has recalled this layout, - * and it should be returned on completion of any pending io */ - PNFS_LAYOUT_RECALLED = 0x04, - /* CB_LAYOUTRECALL indicated that the layout is changing, and "the client - * SHOULD NOT write and commit modified data to the storage devices!" */ - PNFS_LAYOUT_CHANGED = 0x08, - /* a LAYOUTGET error indicated that this layout will never be granted */ PNFS_LAYOUT_UNAVAILABLE = 0x10, /* LAYOUTGET returned BADIOMODE, so a RW layout will never be granted */ @@ -158,6 +151,7 @@ typedef struct __pnfs_layout_state { stateid4 stateid; struct list_entry entry; /* position in nfs41_client.layouts */ struct list_entry layouts; /* list of pnfs_file_layouts */ + struct list_entry recalls; /* list of pnfs_layouts */ enum pnfs_layout_status status; bool_t return_on_close; LONG open_count; /* for return on last close */ @@ -190,21 +184,6 @@ typedef struct __pnfs_file_layout { uint32_t util; } pnfs_file_layout; -typedef struct __pnfs_layout_recall { - enum pnfs_layout_type type; - enum pnfs_iomode iomode; - bool_t changed; - - enum pnfs_return_type recall; - union { - struct { - nfs41_fh fh; - stateid4 stateid; - } file; - nfs41_fsid fsid; - } args; -} pnfs_layout_recall; - /* pnfs_layout.c */ struct pnfs_layout_list; @@ -239,6 +218,15 @@ enum pnfs_status pnfs_file_layout_recall( IN struct __nfs41_client *client, IN const struct 
cb_layoutrecall_args *recall); +/* expects caller to hold a shared lock on pnfs_layout_state */ +enum pnfs_status pnfs_layout_recall_status( + IN const pnfs_layout_state *state, + IN const pnfs_layout *layout); + +void pnfs_layout_recall_fenced( + IN pnfs_layout_state *state, + IN const pnfs_layout *layout); + /* expects caller to hold an exclusive lock on pnfs_layout_state */ void pnfs_layout_io_start( IN pnfs_layout_state *state); diff --git a/daemon/pnfs_io.c b/daemon/pnfs_io.c index ac2f68c..b9efebe 100644 --- a/daemon/pnfs_io.c +++ b/daemon/pnfs_io.c @@ -332,19 +332,14 @@ static enum pnfs_status thread_next_unit( { pnfs_io_pattern *pattern = thread->pattern; pnfs_layout_state *state = pattern->state; - enum pnfs_status status = PNFS_SUCCESS; + enum pnfs_status status; AcquireSRWLockShared(&state->lock); /* stop io if the layout is recalled */ - if (state->status & PNFS_LAYOUT_CHANGED) { - status = PNFSERR_LAYOUT_CHANGED; + status = pnfs_layout_recall_status(state, &thread->layout->layout); + if (status) goto out_unlock; - } - if (state->status & PNFS_LAYOUT_RECALLED) { - status = PNFSERR_LAYOUT_RECALLED; - goto out_unlock; - } status = stripe_next_unit(thread->layout, thread->id, &thread->offset, pattern->offset_end, io); @@ -462,7 +457,8 @@ static uint64_t pattern_bytes_transferred( static enum pnfs_status map_ds_error( IN enum nfsstat4 nfsstat, - IN pnfs_layout_state *state) + IN pnfs_layout_state *state, + IN const pnfs_file_layout *layout) { switch (nfsstat) { case NO_ERROR: @@ -477,10 +473,7 @@ static enum pnfs_status map_ds_error( case NFS4ERR_PNFS_NO_LAYOUT: dprintf(IOLVL, "data server fencing detected!\n"); - AcquireSRWLockExclusive(&state->lock); - /* flag the layout for return once io is finished */ - state->status |= PNFS_LAYOUT_RECALLED | PNFS_LAYOUT_CHANGED; - ReleaseSRWLockExclusive(&state->lock); + pnfs_layout_recall_fenced(state, &layout->layout); /* return CHANGED to prevent any further use of the layout */ return PNFSERR_LAYOUT_CHANGED; @@ 
-535,7 +528,7 @@ static uint32_t WINAPI file_layout_read_thread(void *args) if (nfsstat) { eprintf("nfs41_read() failed with %s\n", nfs_error_string(nfsstat)); - status = map_ds_error(nfsstat, pattern->state); + status = map_ds_error(nfsstat, pattern->state, thread->layout); break; } @@ -610,7 +603,7 @@ retry_write: if (nfsstat) { eprintf("nfs41_write() failed with %s\n", nfs_error_string(nfsstat)); - status = map_ds_error(nfsstat, pattern->state); + status = map_ds_error(nfsstat, pattern->state, thread->layout); break; } if (!verify_write(&verf, &thread->stable)) @@ -645,7 +638,7 @@ retry_write: commit_min, (uint32_t)(commit_max - commit_min), 0, &verf, NULL); if (nfsstat) - status = map_ds_error(nfsstat, pattern->state); + status = map_ds_error(nfsstat, pattern->state, thread->layout); else if (!verify_commit(&verf)) { /* resend the writes unless the layout was recalled */ if (status != PNFSERR_LAYOUT_RECALLED) diff --git a/daemon/pnfs_layout.c b/daemon/pnfs_layout.c index 7fb5bba..e3390f6 100644 --- a/daemon/pnfs_layout.c +++ b/daemon/pnfs_layout.c @@ -55,6 +55,7 @@ static enum pnfs_status layout_state_create( fh_copy(&layout->meta_fh, meta_fh); list_init(&layout->layouts); + list_init(&layout->recalls); InitializeSRWLock(&layout->lock); InitializeConditionVariable(&layout->cond); @@ -80,10 +81,20 @@ static void layout_state_free_layouts( list_init(&state->layouts); } +static void layout_state_free_recalls( + IN pnfs_layout_state *state) +{ + struct list_entry *entry, *tmp; + list_for_each_tmp(entry, tmp, &state->recalls) + free(layout_entry(entry)); + list_init(&state->recalls); +} + static void layout_state_free( IN pnfs_layout_state *state) { layout_state_free_layouts(state); + layout_state_free_recalls(state); free(state); } @@ -702,13 +713,18 @@ enum pnfs_status pnfs_layout_state_prepare( IN uint64_t length) { unsigned char deviceid[PNFS_DEVICEID_SIZE]; + struct list_entry *entry; uint64_t missing; enum pnfs_status status; - /* check for layout recall */ - 
if (state->status & PNFS_LAYOUT_RECALLED) { - status = PNFSERR_LAYOUT_RECALLED; - goto out; + /* fail if the range intersects any pending recalls */ + list_for_each(entry, &state->recalls) { + const pnfs_layout *recall = layout_entry(entry); + if (offset <= recall->offset + recall->length + && recall->offset <= offset + length) { + status = PNFSERR_LAYOUT_RECALLED; + goto out; + } } /* if part of the given range is not covered by a layout, @@ -849,26 +865,175 @@ void pnfs_layout_state_close( /* pnfs_layout_recall */ +struct layout_recall { + pnfs_layout layout; + bool_t changed; +}; +#define recall_entry(pos) list_container(pos, struct layout_recall, layout.entry) -/* expects the caller to have an exclusive lock */ -static void layout_recall_return( +static bool_t layout_recall_compatible( + IN const pnfs_layout *layout, + IN const pnfs_layout *recall) +{ + return layout->type == recall->type + && layout->offset <= (recall->offset + recall->length) + && recall->offset <= (layout->offset + layout->length) + && (recall->iomode == PNFS_IOMODE_ANY || + layout->iomode == recall->iomode); +} + +static pnfs_file_layout* layout_allocate_copy( + IN const pnfs_file_layout *existing) +{ + /* allocate a segment to cover the end of the range */ + pnfs_file_layout *layout = calloc(1, sizeof(pnfs_file_layout)); + if (layout == NULL) + goto out; + + memcpy(layout, existing, sizeof(pnfs_file_layout)); + + /* XXX: don't use the device from existing layout; + * we need to get a reference for ourselves */ + layout->device = NULL; + + /* allocate a copy of the filehandle array */ + layout->filehandles.arr = calloc(layout->filehandles.count, + sizeof(nfs41_path_fh)); + if (layout->filehandles.arr == NULL) + goto out_free; + + memcpy(layout->filehandles.arr, existing->filehandles.arr, + layout->filehandles.count * sizeof(nfs41_path_fh)); +out: + return layout; + +out_free: + file_layout_free(layout); + layout = NULL; + goto out; +} + +static void layout_recall_range( + IN 
pnfs_layout_state *state, + IN const pnfs_layout *recall) +{ + struct list_entry *entry, *tmp; + list_for_each_tmp(entry, tmp, &state->layouts) { + pnfs_file_layout *layout = file_layout_entry(entry); + const uint64_t layout_end = layout->layout.offset + layout->layout.length; + + if (!layout_recall_compatible(&layout->layout, recall)) + continue; + + if (recall->offset > layout->layout.offset) { + /* segment starts before recall; shrink length */ + layout->layout.length = recall->offset - layout->layout.offset; + + if (layout_end > recall->offset + recall->length) { + /* middle chunk of the segment is recalled; + * allocate a new segment to cover the end */ + pnfs_file_layout *remainder = layout_allocate_copy(layout); + if (remainder == NULL) { + /* silently ignore allocation errors here. behave + * as if we 'forgot' this last segment */ + } else { + layout->layout.offset = recall->offset + recall->length; + layout->layout.length = layout_end - layout->layout.offset; + layout_ordered_insert(state, &remainder->layout); + } + } + } else { + /* segment starts after recall */ + if (layout_end <= recall->offset + recall->length) { + /* entire segment is recalled */ + list_remove(&layout->layout.entry); + file_layout_free(layout); + } else { + /* beginning of segment is recalled; shrink offset/length */ + layout->layout.offset = recall->offset + recall->length; + layout->layout.length = layout_end - layout->layout.offset; + } + } + } +} + +static void layout_state_deferred_recalls( IN pnfs_layout_state *state) { - dprintf(FLLVL, "layout_recall_return() 'forgetting' layout\n"); + struct list_entry *entry, *tmp; + list_for_each_tmp(entry, tmp, &state->recalls) { + /* process each deferred layout recall */ + pnfs_layout *recall = layout_entry(entry); + layout_recall_range(state, recall); - layout_state_free_layouts(state); + /* remove/free the recall entry */ + list_remove(&recall->entry); + free(recall); + } +} - /* since we're forgetful, we don't actually return the 
layout; - * just zero the stateid since it won't be valid anymore */ - ZeroMemory(&state->stateid, sizeof(state->stateid)); - state->status = 0; +static void layout_recall_entry_init( + OUT struct layout_recall *lrc, + IN const struct cb_layoutrecall_args *recall) +{ + list_init(&lrc->layout.entry); + if (recall->recall.type == PNFS_RETURN_FILE) { + lrc->layout.offset = recall->recall.args.file.offset; + lrc->layout.length = recall->recall.args.file.length; + } else { + lrc->layout.offset = 0; + lrc->layout.length = NFS4_UINT64_MAX; + } + lrc->layout.iomode = recall->iomode; + lrc->layout.type = PNFS_LAYOUTTYPE_FILE; + lrc->changed = recall->changed; +} + +static enum pnfs_status layout_recall_merge( + IN struct list_entry *list, + IN pnfs_layout *from) +{ + struct list_entry *entry, *tmp; + enum pnfs_status status = PNFSERR_NO_LAYOUT; + + /* attempt to merge the new recall with each existing recall */ + list_for_each_tmp(entry, tmp, list) { + pnfs_layout *to = layout_entry(entry); + const uint64_t to_max = to->offset + to->length; + const uint64_t from_max = from->offset + from->length; + + /* the ranges must meet or overlap */ + if (to_max < from->offset || from_max < to->offset) + continue; + + /* the following fields must match: */ + if (to->iomode != from->iomode || to->type != from->type) + continue; + + dprintf(FLLVL, "merging recalled range {%llu, %llu} with {%llu, %llu}\n", + to->offset, to->length, from->offset, from->length); + + /* calculate the union of the two ranges */ + to->offset = min(to->offset, from->offset); + to->length = max(to_max, from_max) - to->offset; + + /* on success, remove/free the new segment */ + list_remove(&from->entry); + free(from); + status = PNFS_SUCCESS; + + /* because the existing segment 'to' has grown, we may + * be able to merge it with later segments */ + from = to; + } + return status; } static enum pnfs_status file_layout_recall( IN pnfs_layout_state *state, IN const struct cb_layoutrecall_args *recall) { - const 
stateid4 *stateid_arg = &recall->recall.args.file.stateid; + const stateid4 *stateid = &recall->recall.args.file.stateid; enum pnfs_status status = PNFS_SUCCESS; /* under an exclusive lock, flag the layout as recalled */ @@ -877,26 +1042,42 @@ static enum pnfs_status file_layout_recall( if (state->stateid.seqid == 0) { /* return NOMATCHINGLAYOUT if it wasn't actually granted */ status = PNFSERR_NO_LAYOUT; - } else if (recall->recall.type == PNFS_RETURN_FILE - && stateid_arg->seqid > state->stateid.seqid + 1) { - /* the server has processed an outstanding LAYOUTGET or LAYOUTRETURN; - * we must return ERR_DELAY until we get the response and update our - * view of the layout */ - status = PNFS_PENDING; - } else if (state->io_count) { - /* flag the layout as recalled so it can be returned after io */ - state->status |= PNFS_LAYOUT_RECALLED; - if (recall->changed) - state->status |= PNFS_LAYOUT_CHANGED; + goto out; + } + + if (recall->recall.type == PNFS_RETURN_FILE) { + /* detect races between CB_LAYOUTRECALL and LAYOUTGET/LAYOUTRETURN */ + if (stateid->seqid > state->stateid.seqid + 1) { + /* the server has processed an outstanding LAYOUTGET or + * LAYOUTRETURN; we must return ERR_DELAY until we get the + * response and update our view of the layout */ + status = PNFS_PENDING; + goto out; + } - /* if we got a stateid, update the layout's seqid */ - if (recall->recall.type == PNFS_RETURN_FILE) - state->stateid.seqid = stateid_arg->seqid; - } else { - /* if there is no pending io, return the layout now */ - layout_recall_return(state); + /* save the updated seqid */ + state->stateid.seqid = stateid->seqid; } + if (state->io_count) { + /* save an entry for this recall, and process it once io finishes */ + struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall)); + if (lrc == NULL) { + /* on failure to allocate, we'll have to respond + * to the CB_LAYOUTRECALL with NFS4ERR_DELAY */ + status = PNFS_PENDING; + goto out; + } + layout_recall_entry_init(lrc, 
recall); + if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS) + list_add_tail(&state->recalls, &lrc->layout.entry); + } else { + /* if there is no pending io, process the recall immediately */ + struct layout_recall lrc = { 0 }; + layout_recall_entry_init(&lrc, recall); + layout_recall_range(state, &lrc.layout); + } +out: ReleaseSRWLockExclusive(&state->lock); return status; } @@ -1026,6 +1207,51 @@ out: return status; } +/* expects caller to hold a shared lock on pnfs_layout_state */ +enum pnfs_status pnfs_layout_recall_status( + IN const pnfs_layout_state *state, + IN const pnfs_layout *layout) +{ + struct list_entry *entry; + enum pnfs_status status = PNFS_SUCCESS; + + /* search for a pending recall that intersects with the given segment */ + list_for_each(entry, &state->recalls) { + const struct layout_recall *recall = recall_entry(entry); + if (!layout_recall_compatible(layout, &recall->layout)) + continue; + + if (recall->changed) + status = PNFSERR_LAYOUT_CHANGED; + else + status = PNFSERR_LAYOUT_RECALLED; + break; + } + return status; +} + +void pnfs_layout_recall_fenced( + IN pnfs_layout_state *state, + IN const pnfs_layout *layout) +{ + struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall)); + if (lrc == NULL) + return; + + AcquireSRWLockExclusive(&state->lock); + + list_init(&lrc->layout.entry); + lrc->layout.offset = layout->offset; + lrc->layout.length = layout->length; + lrc->layout.iomode = layout->iomode; + lrc->layout.type = layout->type; + lrc->changed = TRUE; + + if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS) + list_add_tail(&state->recalls, &lrc->layout.entry); + + ReleaseSRWLockExclusive(&state->lock); +} /* expects caller to hold an exclusive lock on pnfs_layout_state */ void pnfs_layout_io_start( @@ -1051,9 +1277,8 @@ void pnfs_layout_io_finished( if (state->io_count > 0) /* more io pending */ goto out_unlock; - /* once all io is finished, check for layout recalls */ - if 
(state->status & PNFS_LAYOUT_RECALLED) - layout_recall_return(state); + /* once all io is finished, process any layout recalls */ + layout_state_deferred_recalls(state); /* finish any segment merging that was delayed during io */ if (!list_empty(&state->layouts))