/* $Id: pfs_nn_fsys.x 327 2010-11-18 16:44:40Z gerd $ -*- c -*- */
Filesystem
#include "pfs_types.x" #ifndef PFS_NN_FSYS #define PFS_NN_FSYS program Filesystem { version V1 {
null
void null(void) = 0;
trans_id
which can be freely chosen
by the client. A client may open several transactions
simultaneously.
When the TCP connection is closed, all open transactions are implicitly aborted.
Transacted operations must not overlap! This means that once an op is started, the next op of the same transaction may only be called after the previous one has sent its result back.
Transaction isolation: Only committed changes are visible from other transactions ("read committed"). For block lists, there is an even stronger guarantee. Once a block is returned to the client in a transaction, a competing delete request for this block does not take effect immediately, but is delayed until the transaction that expects the block to exist has finished. This is equivalent to a "repeatable read" isolation level.
Conflicting changes of the same piece of data or metadata can be resolved in various ways:
update_inodeinfo
),
or if blocks are allocated or freed,
the inode is locked for the rest of the transaction, so that
no other transaction can change the inode in parallel.
The other transaction will get an ECONFLICT
error. Read accesses usually do not acquire locks. However, there are some exceptions, and these are explained for each RPC.
Inode numbers are unique.
begin_transaction
rvoid begin_transaction(trans_id) = 1;Starts a new transaction
commit_transaction
rvoid commit_transaction(trans_id) = 2;Commits a transaction and makes its effects permanent. At commit time, there cannot be any logical inconsistencies that would prevent it. However, a commit may fail when data cannot be physically written out.
abort_transaction
rvoid abort_transaction(trans_id) = 3;Aborts the transaction
get_inodeinfo
rinodeinfo get_inodeinfo(trans_id, hyper) = 4;
get_inodeinfo(tid, inode)
: Returns the inodeinfo
struct
for the inode with the ID inode
.
allocate_inode
rhyper allocate_inode(trans_id, inodeinfo) = 5;
allocate_inode(tid, ii)
: Creates a new inode and initializes
the inodeinfo
struct to ii
. The inode is locked.
Note that, at commit time at the latest, an inode must be associated with at least one file name. Otherwise it is implicitly deleted.
update_inodeinfo
rvoid update_inodeinfo(trans_id, hyper, inodeinfo) = 6;
update_inodeinfo(tid, inode, ii)
: Updates the inodeinfo
struct of inode
to ii
. The inode is locked for that.
Only updates of these fields are possible:
usergroup
, mode
, eof
, mtime
, ctime
, replication
,
field1
, create_verifier
. An update of replication
changes
only the required replication, but not the actual
replication.
The seqno
field cannot be modified.
delete_inode
rvoid delete_inode(trans_id, hyper) = 7;
delete_inode(tid, inode)
: Deletes the inode. The inode
is locked for that.
Block lists
get_blocks
rblocklist get_blocks(trans_id, hyper, hyper, hyper, bool) = 8;
get_blocks(tid, inode, index, len, keep)
: Get information about
blocks
index
to index+len-1
of the inode
. There may be several
blockinfo structs for an index if the block is replicated.
If the keep
flag is set, the returned blocks are guaranteed to
exist if a competing transaction requests their deletion,
and even if the delete is committed. This guarantee is
valid for the duration of the transaction tid
. This means
that deletes may be delayed for the time there are still
transactions expecting the blocks to exist. Without keep
the returned block list may already be outdated when the
caller receives it (which may still be useful for some purposes).
Implementing keep
has some cost, so it is allowed to turn
this feature off.
allocate_blocks
rblocklist allocate_blocks(trans_id, hyper, hyper, hyper, bool, longstrings) = 9;
allocate_blocks(tid, inode, index, len, set_mtime, data_pref)
:
Allocate new blocks
for the range index
to index+len-1
. Old blocks in this
range are freed. Blocks are allocated on various nodes
respecting the replication policy. The inode is locked.
set_mtime
: Whether to set mtime
to the server time.
data_pref
: These datanodes are preferred for storing the
blocks. The nodes must be given as identity names.
This list is only a suggestion. For every block it is tried
to allocate it on one of the preferred nodes, even if the
allocation becomes unbalanced. However, if it is not possible
to follow the suggestion it is ignored. If a node name
cannot be identified, the element of data_pref
is silently
ignored. This parameter is mostly useful to make it highly
likely that blocks are stored locally - on the same machine
as the machine running the requesting client.
It is only allowed to allocate blocks for regular files.
free_blocks
rvoid free_blocks(trans_id, hyper, hyper, hyper, bool) = 10;
free_blocks(tid, inodenr, index, len, set_mtime)
: Frees the blocks
index
to index+len-1
. It is not an error to free a block
that was not allocated. The inode is locked.
set_mtime
: Whether to set mtime
to the server time.
statistics
get_fsstat
rfsstat get_fsstat(void) = 11;
rereplicate
rvoid rereplicate(trans_id, hyper) = 12; /* rereplicate(tid, inode): Fixes the replication of the inode */ /* not yet implemented */
get_blocksize
int get_blocksize(void) = 13;returns the blocksize
When a filename is created, the parent directory must already
exist (in the link
operation). For regular files and symlinks
it is allowed that the inode is connected to several filenames.
With unlink
the filename is deleted. Unlike in Unix
the
last unlink
operation for an inode does not delete the inode
automatically. The delete is delayed until the transaction
is committed. (So a file can be renamed by first unlinking the
old name, and then linking the new name.)
For directories, it is required that the directory
is empty before unlink
. It is not possible to delete "/".
There are three kinds of locks for filenames: existence locks, creation locks, and unlink locks.
The locks are not granted for the whole path under which a file is known, but only for the last component of the path relative to the containing directory. The directory can be moved.
For example, assume there is a directory /dir
. Transaction 1
creates a file in this directory /dir/file
. While the
transaction is open, an existence lock on /dir
and a
creation lock on /dir/file
are held. A competing transaction 2
tries to delete the directory /dir
. Of course, both transactions
cannot be committed together - they are logically inconsistent.
What actually happens depends on the order of the operations:
If the file is created first, transaction 1 gets all its locks,
and transaction 2 fails when trying to get an unlink lock
for /dir
. If the deletion occurs first, transaction 2 gets
the unlink lock on /dir
, and transaction 1 fails to acquire
the existence lock on /dir
.
lookup
rhyper lookup(trans_id, hyper, longstring, bool) = 14;
lookup(tid, dir_inode, path, symbolic)
: This RPC is used to look up
file names and path names. Possible data cases:
path
is an absolute path, and dir_inode=(-1)
: This looks
up the path and returns the inode.
path
is a relative path, and dir_inode
is a real
inode of a directory: This walks down the path starting
at dir_inode
.
The lookup resolves symbolic links. If the symbolic
flag is
true, the last component of path
is excluded from symbolic
link resolution.
This RPC does not acquire locks.
rev_lookup
rlongstrings rev_lookup(trans_id, hyper) = 15;
rev_lookup(tid, inode)
:
Reverse lookup for this inode: Returns the connected filenames.
This RPC does not acquire locks.
rev_lookup_dir
rlongstring rev_lookup_dir(trans_id, hyper) = 16;
rev_lookup_dir(tid, inode)
:
Reverse lookup for this inode: If the inode is a directory
returns the absolute name of the directory. Otherwise
an error is returned (ENOENT
if the inode does not exist,
or EFHIER
if the inode is not a directory).
The returned name is existence-locked.
link_count
rint link_count(trans_id, hyper) = 17;
link_count(tid, inode)
:
Returns the number of filenames linked with this inode.
Note that this number is not what Unix puts into the nlink
field of a stat, because PlasmaFS does not create links for
"." and "..".
This RPC does not acquire locks.
link
rvoid link(trans_id, longstring, hyper) = 18;
link(tid, path, inode)
:
Creates this (absolute) filename, and links it with this inode.
This implicitly sets the ctime of the inode and the mtime of the directory inode to the current server time.
An existence lock on the directory containing the new file, and a creation lock for the file are obtained. Note that it is required that the directory is also linked in the file hierarchy (i.e. you cannot put new links into directories without name).
rvoid link_at(trans_id, hyper, longstring, hyper) = 19;
link_at(tid, dir_inode, name, inode)
:
Creates a new name
in the directory referenced by dir_inode
.
The name is connected with inode
.
unlink
rvoid unlink(trans_id, longstring) = 20;
unlink(tid, path)
: Deletes this filename.
This implicitly sets the ctime of the inode and the mtime of the directory inode to the current server time.
This locks the inode, and acquires an unlink lock for the path.
If the number of links for the inode drops to 0 at commit time, the inode is implicitly deleted.
rvoid unlink_at(trans_id, hyper, longstring) = 21;
unlink_at(tid, dir_inode, name)
: Deletes the member name
from the directory referenced by dir_inode
.
list
rentries list(trans_id, hyper) = 22;
list(tid, inode)
:
Lists the contents of the directory. Only a single directory
can be listed (no recursion). The RPC returns the basenames
of the contained files only (path information stripped).
The path name of the directory is existence-locked.
rename
rvoid rename(trans_id, longstring, longstring) = 23;
rename(tid, old_path, new_path)
:
Renames old_path
into new_path
. It is required that
new_path
does not yet exist.
rvoid rename_at(trans_id, hyper, longstring, hyper, longstring) = 24;
rename_at(tid, old_dir_inode, old_name, new_dir_inode, new_name)
:
Renames the path identified by old_name
in old_dir_inode
into the path identified by new_name
in new_dir_inode
.
namelock
rvoid namelock(trans_id, hyper, longstring) = 25;
namelock(tid, dir_inode, name)
: Acquires an existence lock
on the member name
of the directory referenced by dir_inode
.
This means that a competing transaction cannot delete this name,
or rename it. The protection is valid until the end of the
transaction.
Note that it is also not possible to delete the containing
directory, i.e. dir_inode
, because only empty directories
can be deleted. However, this directory can be renamed and
moved away. So this lock does not protect against changes of
the path under which dir_inode
is known.
Fails with ECONFLICT
if the lock cannot be acquired.
It is required that the directory exists, and that there is
a member name
. Otherwise ENOENT
is returned.
The checksums are not automatically set.
The client has to call set_block_checksum
set_block_checksum
for every written
block.
rvoid set_block_checksum(trans_id, hyper, hyper, longstring) = 26;
set_block_checksum(tid, inode, index, checksum)
iterate
rhypers iterate(trans_id, hyper, int) = 27;iterate by inode: iterate(inode, n) returns the up to n smallest inodes that are larger than
inode
. (Privileged operation.)
/* not yet implemented */ } = 1; } = 0x8000e001; #endif