pnfs: handle partial layout recalls

The layout status flags PNFS_LAYOUT_RECALLED and PNFS_LAYOUT_CHANGED are removed and replaced with a list of recalled ranges.  Recalls that arrive during io are added to the list and processed in pnfs_layout_io_finished(); recalls that arrive while no io is pending are processed immediately.
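
As a rough model of that flow (demo_state, demo_recall, and the fixed-size array below are illustrative stand-ins, not the driver's pnfs_layout_state or its list_entry based recall list):

    #include <stdint.h>
    #include <stdio.h>

    struct demo_recall { uint64_t offset, length; };

    struct demo_state {
        unsigned io_count;              /* io threads currently using the layout */
        struct demo_recall recalls[8];  /* pending recalled ranges */
        unsigned nrecalls;
    };

    /* stand-in for layout_recall_range(): trim layout segments against 'r' */
    static void demo_apply_recall(const struct demo_recall *r)
    {
        printf("applying recall {%llu, %llu}\n",
            (unsigned long long)r->offset, (unsigned long long)r->length);
    }

    /* CB_LAYOUTRECALL path: defer while io is in flight, otherwise apply now */
    static void demo_handle_recall(struct demo_state *s, const struct demo_recall *r)
    {
        if (s->io_count) {
            /* io in flight: queue the range for demo_io_finished() */
            if (s->nrecalls < 8)
                s->recalls[s->nrecalls++] = *r;
            /* (the driver answers NFS4ERR_DELAY when it cannot queue) */
        } else {
            /* no io pending: apply the recall right away */
            demo_apply_recall(r);
        }
    }

    /* called when the last io thread finishes */
    static void demo_io_finished(struct demo_state *s)
    {
        for (unsigned i = 0; i < s->nrecalls; i++)
            demo_apply_recall(&s->recalls[i]);
        s->nrecalls = 0;
    }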

The new function layout_recall_range() loops through the existing layout segments and removes any range that intersects the recalled range.  It handles 4 cases per segment (see the sketch after this list):
- only the beginning of the segment is recalled
- only the end of the segment is recalled
- the entire segment is recalled
- only a middle part of the segment is recalled
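
A minimal, self-contained sketch of those four cases, using a simplified struct range in place of the driver's pnfs_layout/pnfs_file_layout types (trim_segment and its out-parameters are illustrative only):

    #include <stdint.h>
    #include <stdbool.h>

    struct range { uint64_t offset, length; };

    /* Trim 'seg' against 'recall'.  Returns false if the whole segment is
     * recalled (drop it); otherwise 'seg' keeps the surviving head or tail,
     * and '*split' receives a second piece when only the middle is recalled. */
    static bool trim_segment(struct range *seg, const struct range *recall,
        struct range *split, bool *have_split)
    {
        const uint64_t seg_end = seg->offset + seg->length;
        const uint64_t rec_end = recall->offset + recall->length;

        *have_split = false;
        if (rec_end <= seg->offset || seg_end <= recall->offset)
            return true;                /* no intersection; keep unchanged */

        if (recall->offset > seg->offset) {
            /* keep the head of the segment */
            seg->length = recall->offset - seg->offset;
            if (seg_end > rec_end) {
                /* only the middle is recalled; keep a tail piece too */
                split->offset = rec_end;
                split->length = seg_end - rec_end;
                *have_split = true;
            }
            return true;
        }
        if (seg_end <= rec_end)
            return false;               /* entire segment recalled */
        /* only the beginning is recalled; keep the tail */
        seg->offset = rec_end;
        seg->length = seg_end - rec_end;
        return true;
    }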

The new function pnfs_layout_recall_status() is called before each unit of io, allowing io threads to bail out early when a recall is detected.  It takes a layout segment as an argument and only returns an error if that segment intersects a recalled range.
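
A simplified sketch of that check (struct range and segment_recalled() are stand-ins; the real code walks the state->recalls list under a shared lock and also matches layout type and iomode):

    #include <stdint.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct range { uint64_t offset, length; };

    static bool ranges_intersect(const struct range *a, const struct range *b)
    {
        /* closed comparison, as in the diff: adjacent ranges count as a hit */
        return a->offset <= b->offset + b->length
            && b->offset <= a->offset + a->length;
    }

    /* returns true if io on 'seg' should stop because a recall covers it */
    static bool segment_recalled(const struct range *seg,
        const struct range *recalls, size_t count)
    {
        for (size_t i = 0; i < count; i++)
            if (ranges_intersect(seg, &recalls[i]))
                return true;
        return false;
    }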

The new function pnfs_layout_recall_fenced() is called when map_ds_error() in pnfs_io.c detects data server fencing.  It also takes a layout segment as an argument, and appends a recall covering that segment's range, flagged as changed.
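
The recall list is kept small by coalescing ranges that meet or overlap (layout_recall_merge() in the diff, which also requires matching iomode and layout type).  A simplified sketch of that union step, with struct range again standing in for pnfs_layout:

    #include <stdint.h>
    #include <stdbool.h>

    struct range { uint64_t offset, length; };

    static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
    static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

    /* returns true if 'from' was absorbed into 'to' */
    static bool try_merge(struct range *to, const struct range *from)
    {
        const uint64_t to_end = to->offset + to->length;
        const uint64_t from_end = from->offset + from->length;

        /* the ranges must meet or overlap */
        if (to_end < from->offset || from_end < to->offset)
            return false;

        /* take the union of the two ranges */
        to->offset = min_u64(to->offset, from->offset);
        to->length = max_u64(to_end, from_end) - to->offset;
        return true;
    }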

pnfs_layout_state_prepare() now checks the given range against the list of recalled ranges, and fails with PNFSERR_LAYOUT_RECALLED if they intersect.

Signed-off-by: Casey Bodley <cbodley@citi.umich.edu>
Author: Casey Bodley <cbodley@citi.umich.edu>
Date:   2012-01-30 13:49:40 -05:00
Commit: db7caebe28 (parent d0ff37a195)

3 changed files with 277 additions and 71 deletions


@@ -84,13 +84,6 @@ enum pnfs_iomode {
 };
 enum pnfs_layout_status {
-    /* CB_LAYOUTRECALL indicated that the server has recalled this layout,
-     * and it should be returned on completion of any pending io */
-    PNFS_LAYOUT_RECALLED = 0x04,
-    /* CB_LAYOUTRECALL indicated that the layout is changing, and "the client
-     * SHOULD NOT write and commit modified data to the storage devices!" */
-    PNFS_LAYOUT_CHANGED = 0x08,
     /* a LAYOUTGET error indicated that this layout will never be granted */
     PNFS_LAYOUT_UNAVAILABLE = 0x10,
     /* LAYOUTGET returned BADIOMODE, so a RW layout will never be granted */
@@ -158,6 +151,7 @@ typedef struct __pnfs_layout_state {
     stateid4 stateid;
     struct list_entry entry; /* position in nfs41_client.layouts */
     struct list_entry layouts; /* list of pnfs_file_layouts */
+    struct list_entry recalls; /* list of pnfs_layouts */
     enum pnfs_layout_status status;
     bool_t return_on_close;
     LONG open_count; /* for return on last close */
@@ -190,21 +184,6 @@ typedef struct __pnfs_file_layout {
     uint32_t util;
 } pnfs_file_layout;
-typedef struct __pnfs_layout_recall {
-    enum pnfs_layout_type type;
-    enum pnfs_iomode iomode;
-    bool_t changed;
-    enum pnfs_return_type recall;
-    union {
-        struct {
-            nfs41_fh fh;
-            stateid4 stateid;
-        } file;
-        nfs41_fsid fsid;
-    } args;
-} pnfs_layout_recall;
 /* pnfs_layout.c */
 struct pnfs_layout_list;
@@ -239,6 +218,15 @@ enum pnfs_status pnfs_file_layout_recall(
     IN struct __nfs41_client *client,
     IN const struct cb_layoutrecall_args *recall);
+/* expects caller to hold a shared lock on pnfs_layout_state */
+enum pnfs_status pnfs_layout_recall_status(
+    IN const pnfs_layout_state *state,
+    IN const pnfs_layout *layout);
+void pnfs_layout_recall_fenced(
+    IN pnfs_layout_state *state,
+    IN const pnfs_layout *layout);
 /* expects caller to hold an exclusive lock on pnfs_layout_state */
 void pnfs_layout_io_start(
     IN pnfs_layout_state *state);


@@ -332,19 +332,14 @@ static enum pnfs_status thread_next_unit(
 {
     pnfs_io_pattern *pattern = thread->pattern;
     pnfs_layout_state *state = pattern->state;
-    enum pnfs_status status = PNFS_SUCCESS;
+    enum pnfs_status status;
     AcquireSRWLockShared(&state->lock);
     /* stop io if the layout is recalled */
-    if (state->status & PNFS_LAYOUT_CHANGED) {
-        status = PNFSERR_LAYOUT_CHANGED;
-        goto out_unlock;
-    }
-    if (state->status & PNFS_LAYOUT_RECALLED) {
-        status = PNFSERR_LAYOUT_RECALLED;
-        goto out_unlock;
-    }
+    status = pnfs_layout_recall_status(state, &thread->layout->layout);
+    if (status)
+        goto out_unlock;
     status = stripe_next_unit(thread->layout, thread->id,
         &thread->offset, pattern->offset_end, io);
@@ -462,7 +457,8 @@ static uint64_t pattern_bytes_transferred(
 static enum pnfs_status map_ds_error(
     IN enum nfsstat4 nfsstat,
-    IN pnfs_layout_state *state)
+    IN pnfs_layout_state *state,
+    IN const pnfs_file_layout *layout)
 {
     switch (nfsstat) {
     case NO_ERROR:
@@ -477,10 +473,7 @@ static enum pnfs_status map_ds_error(
     case NFS4ERR_PNFS_NO_LAYOUT:
         dprintf(IOLVL, "data server fencing detected!\n");
-        AcquireSRWLockExclusive(&state->lock);
-        /* flag the layout for return once io is finished */
-        state->status |= PNFS_LAYOUT_RECALLED | PNFS_LAYOUT_CHANGED;
-        ReleaseSRWLockExclusive(&state->lock);
+        pnfs_layout_recall_fenced(state, &layout->layout);
         /* return CHANGED to prevent any further use of the layout */
         return PNFSERR_LAYOUT_CHANGED;
@@ -535,7 +528,7 @@ static uint32_t WINAPI file_layout_read_thread(void *args)
         if (nfsstat) {
             eprintf("nfs41_read() failed with %s\n",
                 nfs_error_string(nfsstat));
-            status = map_ds_error(nfsstat, pattern->state);
+            status = map_ds_error(nfsstat, pattern->state, thread->layout);
             break;
         }
@@ -610,7 +603,7 @@ retry_write:
         if (nfsstat) {
             eprintf("nfs41_write() failed with %s\n",
                 nfs_error_string(nfsstat));
-            status = map_ds_error(nfsstat, pattern->state);
+            status = map_ds_error(nfsstat, pattern->state, thread->layout);
             break;
         }
         if (!verify_write(&verf, &thread->stable))
@@ -645,7 +638,7 @@ retry_write:
         commit_min, (uint32_t)(commit_max - commit_min), 0, &verf, NULL);
     if (nfsstat)
-        status = map_ds_error(nfsstat, pattern->state);
+        status = map_ds_error(nfsstat, pattern->state, thread->layout);
     else if (!verify_commit(&verf)) {
         /* resend the writes unless the layout was recalled */
         if (status != PNFSERR_LAYOUT_RECALLED)


@@ -55,6 +55,7 @@ static enum pnfs_status layout_state_create(
     fh_copy(&layout->meta_fh, meta_fh);
     list_init(&layout->layouts);
+    list_init(&layout->recalls);
     InitializeSRWLock(&layout->lock);
     InitializeConditionVariable(&layout->cond);
@@ -80,10 +81,20 @@ static void layout_state_free_layouts(
     list_init(&state->layouts);
 }
+static void layout_state_free_recalls(
+    IN pnfs_layout_state *state)
+{
+    struct list_entry *entry, *tmp;
+    list_for_each_tmp(entry, tmp, &state->recalls)
+        free(layout_entry(entry));
+    list_init(&state->recalls);
+}
 static void layout_state_free(
     IN pnfs_layout_state *state)
 {
     layout_state_free_layouts(state);
+    layout_state_free_recalls(state);
     free(state);
 }
@@ -702,14 +713,19 @@ enum pnfs_status pnfs_layout_state_prepare(
     IN uint64_t length)
 {
     unsigned char deviceid[PNFS_DEVICEID_SIZE];
+    struct list_entry *entry;
     uint64_t missing;
     enum pnfs_status status;
-    /* check for layout recall */
-    if (state->status & PNFS_LAYOUT_RECALLED) {
-        status = PNFSERR_LAYOUT_RECALLED;
-        goto out;
-    }
+    /* fail if the range intersects any pending recalls */
+    list_for_each(entry, &state->recalls) {
+        const pnfs_layout *recall = layout_entry(entry);
+        if (offset <= recall->offset + recall->length
+            && recall->offset <= offset + length) {
+            status = PNFSERR_LAYOUT_RECALLED;
+            goto out;
+        }
+    }
     /* if part of the given range is not covered by a layout,
      * attempt to fetch it with LAYOUTGET */
@@ -849,26 +865,175 @@ void pnfs_layout_state_close(
 /* pnfs_layout_recall */
-/* expects the caller to have an exclusive lock */
-static void layout_recall_return(
-    IN pnfs_layout_state *state)
-{
-    dprintf(FLLVL, "layout_recall_return() 'forgetting' layout\n");
-    layout_state_free_layouts(state);
-    /* since we're forgetful, we don't actually return the layout;
-     * just zero the stateid since it won't be valid anymore */
-    ZeroMemory(&state->stateid, sizeof(state->stateid));
-    state->status = 0;
-}
+struct layout_recall {
+    pnfs_layout layout;
+    bool_t changed;
+};
+#define recall_entry(pos) list_container(pos, struct layout_recall, layout.entry)
+static bool_t layout_recall_compatible(
+    IN const pnfs_layout *layout,
+    IN const pnfs_layout *recall)
+{
+    return layout->type == recall->type
+        && layout->offset <= (recall->offset + recall->length)
+        && recall->offset <= (layout->offset + layout->length)
+        && (recall->iomode == PNFS_IOMODE_ANY ||
+            layout->iomode == recall->iomode);
+}
+static pnfs_file_layout* layout_allocate_copy(
+    IN const pnfs_file_layout *existing)
+{
+    /* allocate a segment to cover the end of the range */
+    pnfs_file_layout *layout = calloc(1, sizeof(pnfs_file_layout));
+    if (layout == NULL)
+        goto out;
+    memcpy(layout, existing, sizeof(pnfs_file_layout));
+    /* XXX: don't use the device from existing layout;
+     * we need to get a reference for ourselves */
+    layout->device = NULL;
+    /* allocate a copy of the filehandle array */
+    layout->filehandles.arr = calloc(layout->filehandles.count,
+        sizeof(nfs41_path_fh));
+    if (layout->filehandles.arr == NULL)
+        goto out_free;
+    memcpy(layout->filehandles.arr, existing->filehandles.arr,
+        layout->filehandles.count * sizeof(nfs41_path_fh));
+out:
+    return layout;
+out_free:
+    file_layout_free(layout);
+    layout = NULL;
+    goto out;
+}
+static void layout_recall_range(
+    IN pnfs_layout_state *state,
+    IN const pnfs_layout *recall)
+{
+    struct list_entry *entry, *tmp;
+    list_for_each_tmp(entry, tmp, &state->layouts) {
+        pnfs_file_layout *layout = file_layout_entry(entry);
+        const uint64_t layout_end = layout->layout.offset + layout->layout.length;
+        if (!layout_recall_compatible(&layout->layout, recall))
+            continue;
+        if (recall->offset > layout->layout.offset) {
+            /* segment starts before recall; shrink length */
+            layout->layout.length = recall->offset - layout->layout.offset;
+            if (layout_end > recall->offset + recall->length) {
+                /* middle chunk of the segment is recalled;
+                 * allocate a new segment to cover the end */
+                pnfs_file_layout *remainder = layout_allocate_copy(layout);
+                if (remainder == NULL) {
+                    /* silently ignore allocation errors here. behave
+                     * as if we 'forgot' this last segment */
+                } else {
+                    layout->layout.offset = recall->offset + recall->length;
+                    layout->layout.length = layout_end - layout->layout.offset;
+                    layout_ordered_insert(state, &remainder->layout);
+                }
+            }
+        } else {
+            /* segment starts after recall */
+            if (layout_end <= recall->offset + recall->length) {
+                /* entire segment is recalled */
+                list_remove(&layout->layout.entry);
+                file_layout_free(layout);
+            } else {
+                /* beginning of segment is recalled; shrink offset/length */
+                layout->layout.offset = recall->offset + recall->length;
+                layout->layout.length = layout_end - layout->layout.offset;
+            }
+        }
+    }
+}
+static void layout_state_deferred_recalls(
+    IN pnfs_layout_state *state)
+{
+    struct list_entry *entry, *tmp;
+    list_for_each_tmp(entry, tmp, &state->recalls) {
+        /* process each deferred layout recall */
+        pnfs_layout *recall = layout_entry(entry);
+        layout_recall_range(state, recall);
+        /* remove/free the recall entry */
+        list_remove(&recall->entry);
+        free(recall);
+    }
+}
+static void layout_recall_entry_init(
+    OUT struct layout_recall *lrc,
+    IN const struct cb_layoutrecall_args *recall)
+{
+    list_init(&lrc->layout.entry);
+    if (recall->recall.type == PNFS_RETURN_FILE) {
+        lrc->layout.offset = recall->recall.args.file.offset;
+        lrc->layout.length = recall->recall.args.file.length;
+    } else {
+        lrc->layout.offset = 0;
+        lrc->layout.length = NFS4_UINT64_MAX;
+    }
+    lrc->layout.iomode = recall->iomode;
+    lrc->layout.type = PNFS_LAYOUTTYPE_FILE;
+    lrc->changed = recall->changed;
+}
+static enum pnfs_status layout_recall_merge(
+    IN struct list_entry *list,
+    IN pnfs_layout *from)
+{
+    struct list_entry *entry, *tmp;
+    enum pnfs_status status = PNFSERR_NO_LAYOUT;
+    /* attempt to merge the new recall with each existing recall */
+    list_for_each_tmp(entry, tmp, list) {
+        pnfs_layout *to = layout_entry(entry);
+        const uint64_t to_max = to->offset + to->length;
+        const uint64_t from_max = from->offset + from->length;
+        /* the ranges must meet or overlap */
+        if (to_max < from->offset || from_max < to->offset)
+            continue;
+        /* the following fields must match: */
+        if (to->iomode != from->iomode || to->type != from->type)
+            continue;
+        dprintf(FLLVL, "merging recalled range {%llu, %llu} with {%llu, %llu}\n",
+            to->offset, to->length, from->offset, from->length);
+        /* calculate the union of the two ranges */
+        to->offset = min(to->offset, from->offset);
+        to->length = max(to_max, from_max) - to->offset;
+        /* on success, remove/free the new segment */
+        list_remove(&from->entry);
+        free(from);
+        status = PNFS_SUCCESS;
+        /* because the existing segment 'to' has grown, we may
+         * be able to merge it with later segments */
+        from = to;
+    }
+    return status;
+}
 static enum pnfs_status file_layout_recall(
     IN pnfs_layout_state *state,
     IN const struct cb_layoutrecall_args *recall)
 {
-    const stateid4 *stateid_arg = &recall->recall.args.file.stateid;
+    const stateid4 *stateid = &recall->recall.args.file.stateid;
     enum pnfs_status status = PNFS_SUCCESS;
     /* under an exclusive lock, flag the layout as recalled */
@@ -877,26 +1042,42 @@ static enum pnfs_status file_layout_recall(
     if (state->stateid.seqid == 0) {
         /* return NOMATCHINGLAYOUT if it wasn't actually granted */
         status = PNFSERR_NO_LAYOUT;
-    } else if (recall->recall.type == PNFS_RETURN_FILE
-        && stateid_arg->seqid > state->stateid.seqid + 1) {
-        /* the server has processed an outstanding LAYOUTGET or LAYOUTRETURN;
-         * we must return ERR_DELAY until we get the response and update our
-         * view of the layout */
-        status = PNFS_PENDING;
-    } else if (state->io_count) {
-        /* flag the layout as recalled so it can be returned after io */
-        state->status |= PNFS_LAYOUT_RECALLED;
-        if (recall->changed)
-            state->status |= PNFS_LAYOUT_CHANGED;
-        /* if we got a stateid, update the layout's seqid */
-        if (recall->recall.type == PNFS_RETURN_FILE)
-            state->stateid.seqid = stateid_arg->seqid;
-    } else {
-        /* if there is no pending io, return the layout now */
-        layout_recall_return(state);
+        goto out;
     }
+    if (recall->recall.type == PNFS_RETURN_FILE) {
+        /* detect races between CB_LAYOUTRECALL and LAYOUTGET/LAYOUTRETURN */
+        if (stateid->seqid > state->stateid.seqid + 1) {
+            /* the server has processed an outstanding LAYOUTGET or
+             * LAYOUTRETURN; we must return ERR_DELAY until we get the
+             * response and update our view of the layout */
+            status = PNFS_PENDING;
+            goto out;
+        }
+        /* save the updated seqid */
+        state->stateid.seqid = stateid->seqid;
+    }
+    if (state->io_count) {
+        /* save an entry for this recall, and process it once io finishes */
+        struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall));
+        if (lrc == NULL) {
+            /* on failure to allocate, we'll have to respond
+             * to the CB_LAYOUTRECALL with NFS4ERR_DELAY */
+            status = PNFS_PENDING;
+            goto out;
+        }
+        layout_recall_entry_init(lrc, recall);
+        if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS)
+            list_add_tail(&state->recalls, &lrc->layout.entry);
+    } else {
+        /* if there is no pending io, process the recall immediately */
+        struct layout_recall lrc = { 0 };
+        layout_recall_entry_init(&lrc, recall);
+        layout_recall_range(state, &lrc.layout);
+    }
+out:
     ReleaseSRWLockExclusive(&state->lock);
     return status;
 }
@@ -1026,6 +1207,51 @@ out:
     return status;
 }
+/* expects caller to hold a shared lock on pnfs_layout_state */
+enum pnfs_status pnfs_layout_recall_status(
+    IN const pnfs_layout_state *state,
+    IN const pnfs_layout *layout)
+{
+    struct list_entry *entry;
+    enum pnfs_status status = PNFS_SUCCESS;
+    /* search for a pending recall that intersects with the given segment */
+    list_for_each(entry, &state->recalls) {
+        const struct layout_recall *recall = recall_entry(entry);
+        if (!layout_recall_compatible(layout, &recall->layout))
+            continue;
+        if (recall->changed)
+            status = PNFSERR_LAYOUT_CHANGED;
+        else
+            status = PNFSERR_LAYOUT_RECALLED;
+        break;
+    }
+    return status;
+}
+void pnfs_layout_recall_fenced(
+    IN pnfs_layout_state *state,
+    IN const pnfs_layout *layout)
+{
+    struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall));
+    if (lrc == NULL)
+        return;
+    AcquireSRWLockExclusive(&state->lock);
+    list_init(&lrc->layout.entry);
+    lrc->layout.offset = layout->offset;
+    lrc->layout.length = layout->length;
+    lrc->layout.iomode = layout->iomode;
+    lrc->layout.type = layout->type;
+    lrc->changed = TRUE;
+    if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS)
+        list_add_tail(&state->recalls, &lrc->layout.entry);
+    ReleaseSRWLockExclusive(&state->lock);
+}
 /* expects caller to hold an exclusive lock on pnfs_layout_state */
 void pnfs_layout_io_start(
@@ -1051,9 +1277,8 @@ void pnfs_layout_io_finished(
     if (state->io_count > 0) /* more io pending */
         goto out_unlock;
-    /* once all io is finished, check for layout recalls */
-    if (state->status & PNFS_LAYOUT_RECALLED)
-        layout_recall_return(state);
+    /* once all io is finished, process any layout recalls */
+    layout_state_deferred_recalls(state);
     /* finish any segment merging that was delayed during io */
     if (!list_empty(&state->layouts))