/* $Id: pfs_nn_fsys.x 235 2010-06-20 22:23:05Z gerd $ -*- c -*- */
/** {1:filesystem [Filesystem]} */
/** Filesystem access */
#include "pfs_types.x"
#ifndef PFS_NN_FSYS
#define PFS_NN_FSYS
program Filesystem {
version V1 {
/** {2 [null]} */
void null(void) = 0;
/** [null()]: Takes no arguments and returns no result. The procedure
has no [trans_id] argument, i.e. it is not run inside a transaction.
*/
/** {2 Transactions} */
/** Many procedures are run inside a transaction. A transaction
is identified by a [trans_id] which can be freely chosen
by the client. A client may open several transactions
simultaneously.
When the TCP connection is closed, all open transactions are
implicitly aborted.
Transacted operations must not overlap! This means that once an
op is started, the next op of the same transaction may only be
called after the previous one has sent its result back.
Transaction isolation: Only committed changes are visible
from other transactions ("read committed"). For block
lists, there is an even stronger guarantee. Once a block is
returned to the client in a transaction, a competing
delete request for this block is not immediately visible,
but delayed until the transaction finishes that expects
the block to exist. This is equivalent to a "repeatable
read" isolation level.
Conflicting changes of the same piece of data or metadata
can be resolved in various ways:
{ul
{- If the inode is directly modified (via [update_inodeinfo]),
or if blocks are allocated or freed,
the inode is locked for the rest of the transaction, so that
no other transaction can change the inode in parallel.
The other transaction will get an [ECONFLICT] error.}
{- The inode can also be indirectly modified, e.g. mtime
updates because data is written. These modifications do
not lock the inode. In these cases, the last commit wins,
and overwrites the changes of previous commits.}
{- Directories have a different locking system. A file path
can be locked in three different ways: An {i existence lock}
ensures that no other transaction can delete it. For example,
this kind of lock is acquired for the path of a directory
before a file is created in that directory. A {i creation
lock} is acquired for files that are created exclusively.
An {i unlink lock} is acquired for files that are going
to be deleted.}
}
Read accesses usually do not acquire locks. However, there
are some exceptions, and these are explained for each RPC.
Inode numbers are unique.
*/
/** {2:begin_transaction [begin_transaction] } */
rvoid begin_transaction(trans_id) = 1;
/** [begin_transaction(tid)]: Starts a new transaction identified by
[tid]. The ID is freely chosen by the client, and a client may have
several transactions open simultaneously.
*/
/** {2:commit_transaction [commit_transaction]} */
rvoid commit_transaction(trans_id) = 2;
/** [commit_transaction(tid)]: Commits the transaction [tid] and makes
its effects permanent. At commit time, there cannot be any logical
inconsistencies that would prevent the commit. However, a commit may
still fail when data cannot be physically written out.
*/
/** {2:abort_transaction [abort_transaction]} */
rvoid abort_transaction(trans_id) = 3;
/** [abort_transaction(tid)]: Aborts the transaction [tid]. Note that
all open transactions are also aborted implicitly when the TCP
connection is closed.
*/
/** {2 Inodes} */
/** {2:get_inodeinfo [get_inodeinfo]} */
rinodeinfo get_inodeinfo(trans_id, hyper) = 4;
/** [get_inodeinfo(tid, inode)]: Returns the [inodeinfo] struct
of the inode with the ID [inode], as seen by the transaction [tid].
*/
/** {2:allocate_inode [allocate_inode]} */
rhyper allocate_inode(trans_id, inodeinfo) = 5;
/** [allocate_inode(tid, ii)]: Creates a new inode and initializes
its [inodeinfo] struct to [ii]. The inode is locked.
Note that, at commit time at the latest, the inode must be
associated with at least one file name. Otherwise it is implicitly
deleted.
*/
/** {2:update_inodeinfo [update_inodeinfo]} */
rvoid update_inodeinfo(trans_id, hyper, inodeinfo) = 6;
/** [update_inodeinfo(tid, inode, ii)]: Updates the [inodeinfo]
struct of [inode] to [ii]. The inode is locked for that.
Only the following fields can be updated:
[usergroup], [mode], [eof], [mtime], [ctime], [replication],
[field1], [create_verifier]. An update of [replication] only
changes the required replication, but not the actual
replication.
The [seqno] field cannot be modified.
*/
/** {2:delete_inode [delete_inode]} */
rvoid delete_inode(trans_id, hyper) = 7;
/** [delete_inode(tid, inode)]: Explicitly deletes the inode. The inode
is locked for that.
An inode is also deleted implicitly when it is not linked with any
file name at commit time.
*/
/** {2 Block lists} */
/** {2:get_blocks [get_blocks]} */
rblocklist get_blocks(trans_id, hyper, hyper, hyper) = 8;
/** [get_blocks(tid, inode, index, len)]: Gets information about the
blocks [index] to [index+len-1] of the [inode]. There may be several
blockinfo structs for an index if the block is replicated.
Once returned by [get_blocks], the blocks are guaranteed to
exist even if a competing transaction requests their deletion,
and even if that delete is committed. This guarantee is
valid for the duration of the transaction [tid]. This means
that deletes may be delayed as long as there are still
transactions expecting the blocks to exist.
*/
/** {2:allocate_blocks [allocate_blocks]} */
rblocklist allocate_blocks(trans_id, hyper, hyper, hyper, bool,
longstrings) = 9;
/** [allocate_blocks(tid, inode, index, len, set_mtime, data_pref)]:
Allocates new blocks
for the range [index] to [index+len-1]. Old blocks in this
range are freed. Blocks are allocated on various nodes
respecting the replication policy. The inode is locked.
[set_mtime]: Whether to set [mtime] to the server time.
[data_pref]: These datanodes are preferred for storing the
blocks. The nodes must be given as identity names.
This list is only a suggestion. For every block, an attempt is
made to allocate it on one of the preferred nodes, even if the
allocation becomes unbalanced. However, if it is not possible
to follow the suggestion, it is ignored. If a node name
cannot be identified, that element of [data_pref] is silently
ignored. This parameter is mostly useful to make it highly
likely that blocks are stored locally - on the same machine
as the machine running the requesting client.
Blocks can only be allocated for regular files.
*/
/** {2:free_blocks [free_blocks]} */
rvoid free_blocks(trans_id, hyper, hyper, hyper, bool) = 10;
/** [free_blocks(tid, inode, index, len, set_mtime)]: Frees the blocks
[index] to [index+len-1] of the [inode]. It is not an error to free
a block that was not allocated. The inode is locked.
[set_mtime]: Whether to set [mtime] to the server time.
*/
/** {2 statistics} */
/** {2 [get_fsstat]} */
rfsstat get_fsstat(void) = 11;
/** [get_fsstat()]: Returns filesystem statistics as an [rfsstat]
result. The procedure takes no arguments; in particular it has no
[trans_id] and is therefore not run inside a transaction.
*/
/** {2 replication control} */
/** {2 [rereplicate]} */
rvoid rereplicate(trans_id, hyper) = 12;
/* [rereplicate(tid, inode)]: Fixes the replication of [inode] */
/* not yet implemented */
/** {2 Misc} */
/** {2 [get_blocksize]} */
int get_blocksize(void) = 13;
/** [get_blocksize()]: Returns the blocksize. The procedure takes no
arguments and is not run inside a transaction. Note that the result
is a plain [int] and not a result type like [rint].
*/
/** {2 Directories and filenames} */
/** An existing inode can be connected with a filename. Filenames
are as in Unix (slash-separated). All filenames must start with
a slash.
When a filename is created, the parent directory must already
exist (in the [link] operation). For regular files and symlinks
it is allowed that the inode is connected to several filenames.
With [unlink] the filename is deleted. Unlike in Unix, the
last [unlink] operation for an inode does not delete the inode
automatically. The delete is delayed until the transaction
is committed. (So a file can be renamed by first unlinking the
old name, and then linking the new name.)
For directories, it is required that the directory
is empty before [unlink]. It is not possible to delete "/".
There are three kinds of locks for filenames:
{ul
{- {i Existence locks}: This means that the filename must not
be deleted by a competing transaction. Existence locks are
non-exclusive, i.e. several transactions can hold them for the
same file.}
{- {i Unlink locks}: This is the counterpart - this lock
means that the filename is going to be deleted. Unlink
locks are also non-exclusive. Of course, they cannot
coexist with existence locks on the same filename.}
{- {i Creation locks}: This type of lock means that a filename
is being created in an exclusive way. This lock can only be
acquired once.}
}
For example, assume there is a directory [/dir]. Transaction 1
creates a file in this directory [/dir/file]. While the
transaction is open, an existence lock on [/dir] and a
creation lock on [/dir/file] are held. A competing transaction 2
tries to delete the directory [/dir]. Of course, both transactions
cannot be committed together - they are logically inconsistent.
What actually happens depends on the order of the operations:
If the file is created first, transaction 1 gets all its locks,
and transaction 2 fails when trying to get an unlink lock
for [/dir]. If the deletion occurs first, transaction 2 gets
the unlink lock on [/dir], and transaction 1 fails to acquire
the existence lock on [/dir].
*/
/** {2:lookup [lookup]} */
rhyper lookup(trans_id, longstring, hyper) = 14;
/** [lookup(tid, path, parent)]: This RPC is used to look up
file names and path names. There are three cases:
{ul
{- [path] is an absolute path, and [parent=(-1)]: This looks
up the path and returns the inode.}
{- [path] is an absolute path, and [parent] is the inode
of a directory. This also looks up the path, but the lookup
is only successful if the directory containing the path
has the inode [parent]. Also, in this case a temporary
existence lock on the directory is required.}
{- [path] is a simple file name not containing a slash,
and [parent] is the inode of a directory. This looks
up the filename relative to the directory. In this
case a temporary existence lock on the directory is
needed, too.}
}
In all cases, the resulting effective absolute path of the
file is existence-locked until the end of the transaction.
If locks cannot be acquired, this RPC fails with an
[ECONFLICT] error code. This might be surprising for a
read-type RPC.
There is no symlink resolution. Also, "." and ".." are not
treated specially.
*/
/** {2 [rev_lookup]} */
rlongstrings rev_lookup(trans_id, hyper) = 15;
/** [rev_lookup(tid, inode)]:
Reverse lookup for this inode: Returns the filenames connected
with [inode].
This RPC does not acquire locks.
*/
/** {2 [link_count]} */
rint link_count(trans_id, hyper) = 16;
/** [link_count(tid, inode)]:
Returns the number of filenames linked with this inode.
Note that this number is not what Unix puts into the [nlink]
field of a stat result, because PlasmaFS does not create links
for "." and "..".
This RPC does not acquire locks.
*/
/** {2:link [link]} */
rvoid link(trans_id, longstring, hyper) = 17;
/** [link(tid, path, inode)]:
Creates the filename [path], and links it with [inode].
This implicitly sets the ctime of the inode and the mtime
of the directory inode to the current server time.
An existence lock on the directory containing the new file
and a creation lock on the file itself are obtained.
*/
/** {2:unlink [unlink]} */
rvoid unlink(trans_id, longstring) = 18;
/** [unlink(tid, path)]: Deletes the filename [path].
This implicitly sets the ctime of the inode and the mtime
of the directory inode to the current server time.
This locks the inode, and acquires an unlink lock for the
path.
If the number of links for the inode drops to 0 at commit
time, the inode is implicitly deleted.
*/
/** {2:list [list]} */
rentries list(trans_id, hyper) = 19;
/** [list(tid, inode)]:
Lists the contents of the directory [inode]. Only a single directory
can be listed (no recursion). The RPC returns only the basenames
of the contained files (path information is stripped).
The path name of the directory is existence-locked.
*/
/** {2 [set_block_checksum]} */
/** The checksums are not automatically set.
The client has to call [set_block_checksum] for every written
block.
*/
rvoid set_block_checksum(trans_id, hyper, hyper, longstring) = 20;
/** [set_block_checksum(tid, inode, index, checksum)]: Sets the
checksum of the block [index] of [inode] to [checksum]. The format
of the checksum string is not specified here.
*/
/** {2 [iterate]} */
rhypers iterate(trans_id, hyper, int) = 21;
/** [iterate(tid, inode, n)]: Iterates over inodes: returns up to [n]
inodes, namely the smallest ones that are larger than [inode].
(Privileged operation.)
*/
/* not yet implemented */
} = 1;
} = 0x8000e001;
#endif
/** */