pnfs: handle partial layout recalls
The layout status flags PNFS_LAYOUT_RECALLED and PNFS_LAYOUT_CHANGED are removed and replaced by a list of recalled ranges. Recalls that arrive during io are added to the list and processed in pnfs_layout_io_finished(); recalls outside of io are processed immediately. The new function layout_recall_range() loops through all existing layout segments and removes the ranges that intersect with the recalled range, handling four cases per segment: only the beginning of the segment is recalled; only the end of the segment is recalled; the entire segment is recalled; or only a middle part of the segment is recalled. The new function pnfs_layout_recall_status() is called before each unit of io, allowing io threads to bail out early if a recall is detected; it takes a layout segment as an argument and returns an error only if that segment intersects a recalled range. The new function pnfs_layout_recall_fenced() is called when map_ds_error() in pnfs_io.c detects fencing; it also takes a layout segment as an argument and appends a recall matching the range of that segment. pnfs_layout_state_prepare() now checks the given range against the list of recalled ranges. Signed-off-by: Casey Bodley <cbodley@citi.umich.edu>
This commit is contained in:
parent
d0ff37a195
commit
db7caebe28
3 changed files with 277 additions and 71 deletions
|
|
@ -84,13 +84,6 @@ enum pnfs_iomode {
|
|||
};
|
||||
|
||||
enum pnfs_layout_status {
|
||||
/* CB_LAYOUTRECALL indicated that the server has recalled this layout,
|
||||
* and it should be returned on completion of any pending io */
|
||||
PNFS_LAYOUT_RECALLED = 0x04,
|
||||
/* CB_LAYOUTRECALL indicated that the layout is changing, and "the client
|
||||
* SHOULD NOT write and commit modified data to the storage devices!" */
|
||||
PNFS_LAYOUT_CHANGED = 0x08,
|
||||
|
||||
/* a LAYOUTGET error indicated that this layout will never be granted */
|
||||
PNFS_LAYOUT_UNAVAILABLE = 0x10,
|
||||
/* LAYOUTGET returned BADIOMODE, so a RW layout will never be granted */
|
||||
|
|
@ -158,6 +151,7 @@ typedef struct __pnfs_layout_state {
|
|||
stateid4 stateid;
|
||||
struct list_entry entry; /* position in nfs41_client.layouts */
|
||||
struct list_entry layouts; /* list of pnfs_file_layouts */
|
||||
struct list_entry recalls; /* list of pnfs_layouts */
|
||||
enum pnfs_layout_status status;
|
||||
bool_t return_on_close;
|
||||
LONG open_count; /* for return on last close */
|
||||
|
|
@ -190,21 +184,6 @@ typedef struct __pnfs_file_layout {
|
|||
uint32_t util;
|
||||
} pnfs_file_layout;
|
||||
|
||||
typedef struct __pnfs_layout_recall {
|
||||
enum pnfs_layout_type type;
|
||||
enum pnfs_iomode iomode;
|
||||
bool_t changed;
|
||||
|
||||
enum pnfs_return_type recall;
|
||||
union {
|
||||
struct {
|
||||
nfs41_fh fh;
|
||||
stateid4 stateid;
|
||||
} file;
|
||||
nfs41_fsid fsid;
|
||||
} args;
|
||||
} pnfs_layout_recall;
|
||||
|
||||
|
||||
/* pnfs_layout.c */
|
||||
struct pnfs_layout_list;
|
||||
|
|
@ -239,6 +218,15 @@ enum pnfs_status pnfs_file_layout_recall(
|
|||
IN struct __nfs41_client *client,
|
||||
IN const struct cb_layoutrecall_args *recall);
|
||||
|
||||
/* expects caller to hold a shared lock on pnfs_layout_state */
|
||||
enum pnfs_status pnfs_layout_recall_status(
|
||||
IN const pnfs_layout_state *state,
|
||||
IN const pnfs_layout *layout);
|
||||
|
||||
void pnfs_layout_recall_fenced(
|
||||
IN pnfs_layout_state *state,
|
||||
IN const pnfs_layout *layout);
|
||||
|
||||
/* expects caller to hold an exclusive lock on pnfs_layout_state */
|
||||
void pnfs_layout_io_start(
|
||||
IN pnfs_layout_state *state);
|
||||
|
|
|
|||
|
|
@ -332,19 +332,14 @@ static enum pnfs_status thread_next_unit(
|
|||
{
|
||||
pnfs_io_pattern *pattern = thread->pattern;
|
||||
pnfs_layout_state *state = pattern->state;
|
||||
enum pnfs_status status = PNFS_SUCCESS;
|
||||
enum pnfs_status status;
|
||||
|
||||
AcquireSRWLockShared(&state->lock);
|
||||
|
||||
/* stop io if the layout is recalled */
|
||||
if (state->status & PNFS_LAYOUT_CHANGED) {
|
||||
status = PNFSERR_LAYOUT_CHANGED;
|
||||
status = pnfs_layout_recall_status(state, &thread->layout->layout);
|
||||
if (status)
|
||||
goto out_unlock;
|
||||
}
|
||||
if (state->status & PNFS_LAYOUT_RECALLED) {
|
||||
status = PNFSERR_LAYOUT_RECALLED;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
status = stripe_next_unit(thread->layout, thread->id,
|
||||
&thread->offset, pattern->offset_end, io);
|
||||
|
|
@ -462,7 +457,8 @@ static uint64_t pattern_bytes_transferred(
|
|||
|
||||
static enum pnfs_status map_ds_error(
|
||||
IN enum nfsstat4 nfsstat,
|
||||
IN pnfs_layout_state *state)
|
||||
IN pnfs_layout_state *state,
|
||||
IN const pnfs_file_layout *layout)
|
||||
{
|
||||
switch (nfsstat) {
|
||||
case NO_ERROR:
|
||||
|
|
@ -477,10 +473,7 @@ static enum pnfs_status map_ds_error(
|
|||
case NFS4ERR_PNFS_NO_LAYOUT:
|
||||
dprintf(IOLVL, "data server fencing detected!\n");
|
||||
|
||||
AcquireSRWLockExclusive(&state->lock);
|
||||
/* flag the layout for return once io is finished */
|
||||
state->status |= PNFS_LAYOUT_RECALLED | PNFS_LAYOUT_CHANGED;
|
||||
ReleaseSRWLockExclusive(&state->lock);
|
||||
pnfs_layout_recall_fenced(state, &layout->layout);
|
||||
|
||||
/* return CHANGED to prevent any further use of the layout */
|
||||
return PNFSERR_LAYOUT_CHANGED;
|
||||
|
|
@ -535,7 +528,7 @@ static uint32_t WINAPI file_layout_read_thread(void *args)
|
|||
if (nfsstat) {
|
||||
eprintf("nfs41_read() failed with %s\n",
|
||||
nfs_error_string(nfsstat));
|
||||
status = map_ds_error(nfsstat, pattern->state);
|
||||
status = map_ds_error(nfsstat, pattern->state, thread->layout);
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
@ -610,7 +603,7 @@ retry_write:
|
|||
if (nfsstat) {
|
||||
eprintf("nfs41_write() failed with %s\n",
|
||||
nfs_error_string(nfsstat));
|
||||
status = map_ds_error(nfsstat, pattern->state);
|
||||
status = map_ds_error(nfsstat, pattern->state, thread->layout);
|
||||
break;
|
||||
}
|
||||
if (!verify_write(&verf, &thread->stable))
|
||||
|
|
@ -645,7 +638,7 @@ retry_write:
|
|||
commit_min, (uint32_t)(commit_max - commit_min), 0, &verf, NULL);
|
||||
|
||||
if (nfsstat)
|
||||
status = map_ds_error(nfsstat, pattern->state);
|
||||
status = map_ds_error(nfsstat, pattern->state, thread->layout);
|
||||
else if (!verify_commit(&verf)) {
|
||||
/* resend the writes unless the layout was recalled */
|
||||
if (status != PNFSERR_LAYOUT_RECALLED)
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ static enum pnfs_status layout_state_create(
|
|||
|
||||
fh_copy(&layout->meta_fh, meta_fh);
|
||||
list_init(&layout->layouts);
|
||||
list_init(&layout->recalls);
|
||||
InitializeSRWLock(&layout->lock);
|
||||
InitializeConditionVariable(&layout->cond);
|
||||
|
||||
|
|
@ -80,10 +81,20 @@ static void layout_state_free_layouts(
|
|||
list_init(&state->layouts);
|
||||
}
|
||||
|
||||
static void layout_state_free_recalls(
|
||||
IN pnfs_layout_state *state)
|
||||
{
|
||||
struct list_entry *entry, *tmp;
|
||||
list_for_each_tmp(entry, tmp, &state->recalls)
|
||||
free(layout_entry(entry));
|
||||
list_init(&state->recalls);
|
||||
}
|
||||
|
||||
static void layout_state_free(
|
||||
IN pnfs_layout_state *state)
|
||||
{
|
||||
layout_state_free_layouts(state);
|
||||
layout_state_free_recalls(state);
|
||||
free(state);
|
||||
}
|
||||
|
||||
|
|
@ -702,13 +713,18 @@ enum pnfs_status pnfs_layout_state_prepare(
|
|||
IN uint64_t length)
|
||||
{
|
||||
unsigned char deviceid[PNFS_DEVICEID_SIZE];
|
||||
struct list_entry *entry;
|
||||
uint64_t missing;
|
||||
enum pnfs_status status;
|
||||
|
||||
/* check for layout recall */
|
||||
if (state->status & PNFS_LAYOUT_RECALLED) {
|
||||
status = PNFSERR_LAYOUT_RECALLED;
|
||||
goto out;
|
||||
/* fail if the range intersects any pending recalls */
|
||||
list_for_each(entry, &state->recalls) {
|
||||
const pnfs_layout *recall = layout_entry(entry);
|
||||
if (offset <= recall->offset + recall->length
|
||||
&& recall->offset <= offset + length) {
|
||||
status = PNFSERR_LAYOUT_RECALLED;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* if part of the given range is not covered by a layout,
|
||||
|
|
@ -849,26 +865,175 @@ void pnfs_layout_state_close(
|
|||
|
||||
|
||||
/* pnfs_layout_recall */
|
||||
/* a recalled byte range, kept on pnfs_layout_state.recalls until it can
 * be processed. NOTE: 'layout' must remain the first member, because
 * recall entries are freed through a pnfs_layout* (see
 * layout_state_free_recalls()) */
struct layout_recall {
    pnfs_layout layout; /* range/iomode/type of the recall + list entry */
    bool_t changed;     /* TRUE if the server indicated the layout changed */
};
#define recall_entry(pos) list_container(pos, struct layout_recall, layout.entry)
|
||||
|
||||
/* expects the caller to have an exclusive lock */
|
||||
static void layout_recall_return(
|
||||
static bool_t layout_recall_compatible(
|
||||
IN const pnfs_layout *layout,
|
||||
IN const pnfs_layout *recall)
|
||||
{
|
||||
return layout->type == recall->type
|
||||
&& layout->offset <= (recall->offset + recall->length)
|
||||
&& recall->offset <= (layout->offset + layout->length)
|
||||
&& (recall->iomode == PNFS_IOMODE_ANY ||
|
||||
layout->iomode == recall->iomode);
|
||||
}
|
||||
|
||||
static pnfs_file_layout* layout_allocate_copy(
|
||||
IN const pnfs_file_layout *existing)
|
||||
{
|
||||
/* allocate a segment to cover the end of the range */
|
||||
pnfs_file_layout *layout = calloc(1, sizeof(pnfs_file_layout));
|
||||
if (layout == NULL)
|
||||
goto out;
|
||||
|
||||
memcpy(layout, existing, sizeof(pnfs_file_layout));
|
||||
|
||||
/* XXX: don't use the device from existing layout;
|
||||
* we need to get a reference for ourselves */
|
||||
layout->device = NULL;
|
||||
|
||||
/* allocate a copy of the filehandle array */
|
||||
layout->filehandles.arr = calloc(layout->filehandles.count,
|
||||
sizeof(nfs41_path_fh));
|
||||
if (layout->filehandles.arr == NULL)
|
||||
goto out_free;
|
||||
|
||||
memcpy(layout->filehandles.arr, existing->filehandles.arr,
|
||||
layout->filehandles.count * sizeof(nfs41_path_fh));
|
||||
out:
|
||||
return layout;
|
||||
|
||||
out_free:
|
||||
file_layout_free(layout);
|
||||
layout = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
static void layout_recall_range(
|
||||
IN pnfs_layout_state *state,
|
||||
IN const pnfs_layout *recall)
|
||||
{
|
||||
struct list_entry *entry, *tmp;
|
||||
list_for_each_tmp(entry, tmp, &state->layouts) {
|
||||
pnfs_file_layout *layout = file_layout_entry(entry);
|
||||
const uint64_t layout_end = layout->layout.offset + layout->layout.length;
|
||||
|
||||
if (!layout_recall_compatible(&layout->layout, recall))
|
||||
continue;
|
||||
|
||||
if (recall->offset > layout->layout.offset) {
|
||||
/* segment starts before recall; shrink length */
|
||||
layout->layout.length = recall->offset - layout->layout.offset;
|
||||
|
||||
if (layout_end > recall->offset + recall->length) {
|
||||
/* middle chunk of the segment is recalled;
|
||||
* allocate a new segment to cover the end */
|
||||
pnfs_file_layout *remainder = layout_allocate_copy(layout);
|
||||
if (remainder == NULL) {
|
||||
/* silently ignore allocation errors here. behave
|
||||
* as if we 'forgot' this last segment */
|
||||
} else {
|
||||
layout->layout.offset = recall->offset + recall->length;
|
||||
layout->layout.length = layout_end - layout->layout.offset;
|
||||
layout_ordered_insert(state, &remainder->layout);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* segment starts after recall */
|
||||
if (layout_end <= recall->offset + recall->length) {
|
||||
/* entire segment is recalled */
|
||||
list_remove(&layout->layout.entry);
|
||||
file_layout_free(layout);
|
||||
} else {
|
||||
/* beginning of segment is recalled; shrink offset/length */
|
||||
layout->layout.offset = recall->offset + recall->length;
|
||||
layout->layout.length = layout_end - layout->layout.offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void layout_state_deferred_recalls(
|
||||
IN pnfs_layout_state *state)
|
||||
{
|
||||
dprintf(FLLVL, "layout_recall_return() 'forgetting' layout\n");
|
||||
struct list_entry *entry, *tmp;
|
||||
list_for_each_tmp(entry, tmp, &state->recalls) {
|
||||
/* process each deferred layout recall */
|
||||
pnfs_layout *recall = layout_entry(entry);
|
||||
layout_recall_range(state, recall);
|
||||
|
||||
layout_state_free_layouts(state);
|
||||
/* remove/free the recall entry */
|
||||
list_remove(&recall->entry);
|
||||
free(recall);
|
||||
}
|
||||
}
|
||||
|
||||
/* since we're forgetful, we don't actually return the layout;
|
||||
* just zero the stateid since it won't be valid anymore */
|
||||
ZeroMemory(&state->stateid, sizeof(state->stateid));
|
||||
state->status = 0;
|
||||
static void layout_recall_entry_init(
|
||||
OUT struct layout_recall *lrc,
|
||||
IN const struct cb_layoutrecall_args *recall)
|
||||
{
|
||||
list_init(&lrc->layout.entry);
|
||||
if (recall->recall.type == PNFS_RETURN_FILE) {
|
||||
lrc->layout.offset = recall->recall.args.file.offset;
|
||||
lrc->layout.length = recall->recall.args.file.length;
|
||||
} else {
|
||||
lrc->layout.offset = 0;
|
||||
lrc->layout.length = NFS4_UINT64_MAX;
|
||||
}
|
||||
lrc->layout.iomode = recall->iomode;
|
||||
lrc->layout.type = PNFS_LAYOUTTYPE_FILE;
|
||||
lrc->changed = recall->changed;
|
||||
}
|
||||
|
||||
static enum pnfs_status layout_recall_merge(
|
||||
IN struct list_entry *list,
|
||||
IN pnfs_layout *from)
|
||||
{
|
||||
struct list_entry *entry, *tmp;
|
||||
enum pnfs_status status = PNFSERR_NO_LAYOUT;
|
||||
|
||||
/* attempt to merge the new recall with each existing recall */
|
||||
list_for_each_tmp(entry, tmp, list) {
|
||||
pnfs_layout *to = layout_entry(entry);
|
||||
const uint64_t to_max = to->offset + to->length;
|
||||
const uint64_t from_max = from->offset + from->length;
|
||||
|
||||
/* the ranges must meet or overlap */
|
||||
if (to_max < from->offset || from_max < to->offset)
|
||||
continue;
|
||||
|
||||
/* the following fields must match: */
|
||||
if (to->iomode != from->iomode || to->type != from->type)
|
||||
continue;
|
||||
|
||||
dprintf(FLLVL, "merging recalled range {%llu, %llu} with {%llu, %llu}\n",
|
||||
to->offset, to->length, from->offset, from->length);
|
||||
|
||||
/* calculate the union of the two ranges */
|
||||
to->offset = min(to->offset, from->offset);
|
||||
to->length = max(to_max, from_max) - to->offset;
|
||||
|
||||
/* on success, remove/free the new segment */
|
||||
list_remove(&from->entry);
|
||||
free(from);
|
||||
status = PNFS_SUCCESS;
|
||||
|
||||
/* because the existing segment 'to' has grown, we may
|
||||
* be able to merge it with later segments */
|
||||
from = to;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
static enum pnfs_status file_layout_recall(
|
||||
IN pnfs_layout_state *state,
|
||||
IN const struct cb_layoutrecall_args *recall)
|
||||
{
|
||||
const stateid4 *stateid_arg = &recall->recall.args.file.stateid;
|
||||
const stateid4 *stateid = &recall->recall.args.file.stateid;
|
||||
enum pnfs_status status = PNFS_SUCCESS;
|
||||
|
||||
/* under an exclusive lock, flag the layout as recalled */
|
||||
|
|
@ -877,26 +1042,42 @@ static enum pnfs_status file_layout_recall(
|
|||
if (state->stateid.seqid == 0) {
|
||||
/* return NOMATCHINGLAYOUT if it wasn't actually granted */
|
||||
status = PNFSERR_NO_LAYOUT;
|
||||
} else if (recall->recall.type == PNFS_RETURN_FILE
|
||||
&& stateid_arg->seqid > state->stateid.seqid + 1) {
|
||||
/* the server has processed an outstanding LAYOUTGET or LAYOUTRETURN;
|
||||
* we must return ERR_DELAY until we get the response and update our
|
||||
* view of the layout */
|
||||
status = PNFS_PENDING;
|
||||
} else if (state->io_count) {
|
||||
/* flag the layout as recalled so it can be returned after io */
|
||||
state->status |= PNFS_LAYOUT_RECALLED;
|
||||
if (recall->changed)
|
||||
state->status |= PNFS_LAYOUT_CHANGED;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (recall->recall.type == PNFS_RETURN_FILE) {
|
||||
/* detect races between CB_LAYOUTRECALL and LAYOUTGET/LAYOUTRETURN */
|
||||
if (stateid->seqid > state->stateid.seqid + 1) {
|
||||
/* the server has processed an outstanding LAYOUTGET or
|
||||
* LAYOUTRETURN; we must return ERR_DELAY until we get the
|
||||
* response and update our view of the layout */
|
||||
status = PNFS_PENDING;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* if we got a stateid, update the layout's seqid */
|
||||
if (recall->recall.type == PNFS_RETURN_FILE)
|
||||
state->stateid.seqid = stateid_arg->seqid;
|
||||
} else {
|
||||
/* if there is no pending io, return the layout now */
|
||||
layout_recall_return(state);
|
||||
/* save the updated seqid */
|
||||
state->stateid.seqid = stateid->seqid;
|
||||
}
|
||||
|
||||
if (state->io_count) {
|
||||
/* save an entry for this recall, and process it once io finishes */
|
||||
struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall));
|
||||
if (lrc == NULL) {
|
||||
/* on failure to allocate, we'll have to respond
|
||||
* to the CB_LAYOUTRECALL with NFS4ERR_DELAY */
|
||||
status = PNFS_PENDING;
|
||||
goto out;
|
||||
}
|
||||
layout_recall_entry_init(lrc, recall);
|
||||
if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS)
|
||||
list_add_tail(&state->recalls, &lrc->layout.entry);
|
||||
} else {
|
||||
/* if there is no pending io, process the recall immediately */
|
||||
struct layout_recall lrc = { 0 };
|
||||
layout_recall_entry_init(&lrc, recall);
|
||||
layout_recall_range(state, &lrc.layout);
|
||||
}
|
||||
out:
|
||||
ReleaseSRWLockExclusive(&state->lock);
|
||||
return status;
|
||||
}
|
||||
|
|
@ -1026,6 +1207,51 @@ out:
|
|||
return status;
|
||||
}
|
||||
|
||||
/* expects caller to hold a shared lock on pnfs_layout_state */
|
||||
enum pnfs_status pnfs_layout_recall_status(
|
||||
IN const pnfs_layout_state *state,
|
||||
IN const pnfs_layout *layout)
|
||||
{
|
||||
struct list_entry *entry;
|
||||
enum pnfs_status status = PNFS_SUCCESS;
|
||||
|
||||
/* search for a pending recall that intersects with the given segment */
|
||||
list_for_each(entry, &state->recalls) {
|
||||
const struct layout_recall *recall = recall_entry(entry);
|
||||
if (!layout_recall_compatible(layout, &recall->layout))
|
||||
continue;
|
||||
|
||||
if (recall->changed)
|
||||
status = PNFSERR_LAYOUT_CHANGED;
|
||||
else
|
||||
status = PNFSERR_LAYOUT_RECALLED;
|
||||
break;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
void pnfs_layout_recall_fenced(
|
||||
IN pnfs_layout_state *state,
|
||||
IN const pnfs_layout *layout)
|
||||
{
|
||||
struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall));
|
||||
if (lrc == NULL)
|
||||
return;
|
||||
|
||||
AcquireSRWLockExclusive(&state->lock);
|
||||
|
||||
list_init(&lrc->layout.entry);
|
||||
lrc->layout.offset = layout->offset;
|
||||
lrc->layout.length = layout->length;
|
||||
lrc->layout.iomode = layout->iomode;
|
||||
lrc->layout.type = layout->type;
|
||||
lrc->changed = TRUE;
|
||||
|
||||
if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS)
|
||||
list_add_tail(&state->recalls, &lrc->layout.entry);
|
||||
|
||||
ReleaseSRWLockExclusive(&state->lock);
|
||||
}
|
||||
|
||||
/* expects caller to hold an exclusive lock on pnfs_layout_state */
|
||||
void pnfs_layout_io_start(
|
||||
|
|
@ -1051,9 +1277,8 @@ void pnfs_layout_io_finished(
|
|||
if (state->io_count > 0) /* more io pending */
|
||||
goto out_unlock;
|
||||
|
||||
/* once all io is finished, check for layout recalls */
|
||||
if (state->status & PNFS_LAYOUT_RECALLED)
|
||||
layout_recall_return(state);
|
||||
/* once all io is finished, process any layout recalls */
|
||||
layout_state_deferred_recalls(state);
|
||||
|
||||
/* finish any segment merging that was delayed during io */
|
||||
if (!list_empty(&state->layouts))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue