/* $Id: pfs_nn_fsys.x 235 2010-06-20 22:23:05Z gerd $ -*- c -*- */
/* Filesystem: RPC interface of the namenode for filesystem operations */
#include "pfs_types.x"
#ifndef PFS_NN_FSYS
#define PFS_NN_FSYS
program Filesystem {
    version V1 {

        /* null
         *
         * The usual RPC null procedure: takes no argument and returns
         * nothing.
         */
        void null(void) = 0;

        /* Transactions
         *
         * NOTE(review): the beginning of the following sentence was lost
         * in the source; the lead-in is reconstructed.
         *
         * A transaction is identified by a trans_id which can be freely
         * chosen by the client. A client may open several transactions
         * simultaneously.
         *
         * When the TCP connection is closed, all open transactions are
         * implicitly aborted.
         *
         * Transacted operations must not overlap! This means that once an
         * op is started, the next op of the same transaction may only be
         * called after the previous one has sent its result back.
         *
         * Transaction isolation: Only committed changes are visible from
         * other transactions ("read committed"). For block lists, there is
         * even a stronger guarantee. Once a block is returned to the client
         * in a transaction, a competing delete request for this block is
         * not immediately visible, but delayed until the transaction
         * finishes that expects the block to exist. This is equivalent to
         * a "repeatable read" isolation level.
         *
         * Conflicting changes of the same piece of data or metadata can be
         * resolved in various ways:
         *
         * NOTE(review): the start of the following item was lost in the
         * source; "If an inode is changed (" is a reconstruction.
         *
         *  - If an inode is changed (update_inodeinfo), or if blocks are
         *    allocated or freed, the inode is locked for the rest of the
         *    transaction, so that no other transaction can change the
         *    inode in parallel. The other transaction will get an
         *    ECONFLICT error.
         *
         * Read accesses usually do not acquire locks. However, there are
         * some exceptions, and these are explained for each RPC.
         *
         * Inode numbers are unique.
         */

        /* begin_transaction
         *
         * begin_transaction(tid): Starts a new transaction.
         */
        rvoid begin_transaction(trans_id) = 1;

        /* commit_transaction
         *
         * commit_transaction(tid): Commits a transaction and makes its
         * effects permanent. At commit time, there cannot be any logical
         * inconsistencies that would prevent it. However, a commit may
         * fail when data cannot be physically written out.
         */
        rvoid commit_transaction(trans_id) = 2;

        /* abort_transaction
         *
         * abort_transaction(tid): Aborts the transaction.
         */
        rvoid abort_transaction(trans_id) = 3;

        /* get_inodeinfo
         *
         * get_inodeinfo(tid, inode): Returns the inodeinfo struct for the
         * inode with the ID inode.
         */
        rinodeinfo get_inodeinfo(trans_id, hyper) = 4;

        /* allocate_inode
         *
         * allocate_inode(tid, ii): Creates a new inode and initializes the
         * inodeinfo struct to ii. The inode is locked.
         *
         * Note that at the latest at commit time an inode must be
         * associated with at least one file name. Otherwise it is
         * implicitly deleted.
         */
        rhyper allocate_inode(trans_id, inodeinfo) = 5;

        /* update_inodeinfo
         *
         * update_inodeinfo(tid, inode, ii): Updates the inodeinfo struct
         * of inode to ii. The inode is locked for that.
         *
         * Only updates of these fields are possible: usergroup, mode, eof,
         * mtime, ctime, replication, field1, create_verifier. An update of
         * replication only changes the required replication, but not the
         * actual replication.
         *
         * The seqno field cannot be modified.
         */
        rvoid update_inodeinfo(trans_id, hyper, inodeinfo) = 6;

        /* delete_inode
         *
         * delete_inode(tid, inode): Deletes the inode. The inode is locked
         * for that.
         */
        rvoid delete_inode(trans_id, hyper) = 7;

        /* get_blocks
         *
         * get_blocks(tid, inode, index, len): Get information about blocks
         * index to index+len-1 of the inode. There may be several
         * blockinfo structs for an index if the block is replicated.
         *
         * Once returned by get_blocks, the blocks are guaranteed to exist
         * even if a competing transaction requests their deletion, and
         * even if the delete is committed. This guarantee is valid for the
         * duration of the transaction tid. This means that deletes may be
         * delayed for the time there are still transactions expecting the
         * blocks to exist.
         */
        rblocklist get_blocks(trans_id, hyper, hyper, hyper) = 8;

        /* allocate_blocks
         *
         * allocate_blocks(tid, inode, index, len, set_mtime, data_pref):
         * Allocate new blocks for the range index to index+len-1. Old
         * blocks in this range are freed. Blocks are allocated on various
         * nodes respecting the replication policy. The inode is locked.
         *
         * set_mtime: Whether to set mtime to the server time.
         *
         * data_pref: These datanodes are preferred for storing the blocks.
         * The nodes must be given as identity names. This list is only a
         * suggestion. For every block it is tried to allocate it on one of
         * the preferred nodes, even if the allocation becomes unbalanced.
         * However, if it is not possible to follow the suggestion it is
         * ignored. If a node name cannot be identified, the element of
         * data_pref is silently ignored. This parameter is mostly useful
         * to make it highly likely that blocks are stored locally - on the
         * same machine as the machine running the requesting client.
         *
         * It is only allowed to allocate blocks for regular files.
         */
        rblocklist allocate_blocks(trans_id, hyper, hyper, hyper, bool, longstrings) = 9;

        /* free_blocks
         *
         * free_blocks(tid, inodenr, index, len, set_mtime): Frees the
         * blocks index to index+len-1. It is not an error to free a block
         * that was not allocated. The inode is locked.
         *
         * set_mtime: Whether to set mtime to the server time.
         */
        rvoid free_blocks(trans_id, hyper, hyper, hyper, bool) = 10;

        /* get_fsstat
         *
         * get_fsstat(): takes no argument and returns an rfsstat value.
         */
        rfsstat get_fsstat(void) = 11;

        /* rereplicate
         *
         * rereplicate(tid, inode): Fixes the replication.
         */
        rvoid rereplicate(trans_id, hyper) = 12;
        /* not yet implemented */

        /* get_blocksize
         *
         * get_blocksize(): Returns the blocksize.
         */
        int get_blocksize(void) = 13;

        /* Filenames
         *
         * When a filename is created, the parent directory must already
         * exist (in the link operation). For regular files and symlinks
         * it is allowed that the inode is connected to several filenames.
         *
         * With unlink the filename is deleted. Unlike in Unix the last
         * unlink operation for an inode does not delete the inode
         * immediately. The delete is delayed until the transaction is
         * committed. (So a file can be renamed by first unlinking the old
         * name, and then linking the new name.)
         *
         * For directories, it is required that the directory is empty
         * before unlink. It is not possible to delete "/".
         *
         * There are three kinds of locks for filenames.
         *
         * NOTE(review): the enumeration of the three kinds was lost in the
         * source. The remaining text mentions existence locks, creation
         * locks, and unlink locks; their exact definitions need to be
         * restored from the original documentation.
         *
         * For example, assume there is a directory /dir. Transaction 1
         * creates a file in this directory /dir/file. While the
         * transaction is open, an existence lock on /dir and a creation
         * lock on /dir/file are held. A competing transaction 2 tries to
         * delete the directory /dir. Of course, both transactions cannot
         * be committed together - they are logically inconsistent. What
         * actually happens depends on the order of the operations: If the
         * file is created first, transaction 1 gets all its locks, and
         * transaction 2 fails when trying to get an unlink lock for /dir.
         * If the deletion occurs first, transaction 2 gets the unlink lock
         * on /dir, and transaction 1 fails to acquire the existence lock
         * on /dir.
         */

        /* lookup
         *
         * lookup(tid, path, parent): This RPC is used to look up file
         * names and path names. There are three cases:
         *
         *  - path is an absolute path, and parent=(-1): This looks up the
         *    path and returns the inode.
         *
         *  - path is an absolute path, and parent is the inode of a
         *    directory. This also looks up the path, but this is only
         *    successful if the directory containing the path has the inode
         *    parent. Also, in this case a temporary existence lock of the
         *    directory is required.
         *
         *  - path is a simple file name not containing a slash, and parent
         *    is the inode of a directory. This looks up the filename
         *    relative to the directory. In this case a temporary existence
         *    lock of the directory is needed, too.
         *
         * In all cases, the resulting effective absolute path of the file
         * is existence-locked until the end of the transaction. If locks
         * cannot be acquired, this RPC fails with an ECONFLICT error code.
         * This might be surprising for a read-type RPC.
         *
         * There is no symlink resolution. Also, "." and ".." are not
         * treated specially.
         */
        rhyper lookup(trans_id, longstring, hyper) = 14;

        /* rev_lookup
         *
         * rev_lookup(tid, inode): Reverse lookup for this inode: Returns
         * the connected filenames.
         *
         * This RPC does not acquire locks.
         */
        rlongstrings rev_lookup(trans_id, hyper) = 15;

        /* link_count
         *
         * link_count(tid, inode): Returns the number of filenames linked
         * with this inode. Note that this number is not what Unix puts
         * into the nlink field of a stat, because PlasmaFS does not create
         * links for "." and "..".
         *
         * This RPC does not acquire locks.
         */
        rint link_count(trans_id, hyper) = 16;

        /* link
         *
         * link(tid, path, inode): Creates this filename, and links it with
         * this inode.
         *
         * This implicitly sets the ctime of the inode and the mtime of the
         * directory inode to the current server time.
         *
         * An existence lock on the directory containing the new file, and
         * a creation lock for the file are obtained.
         */
        rvoid link(trans_id, longstring, hyper) = 17;

        /* unlink
         *
         * unlink(tid, path): Deletes this filename.
         *
         * This implicitly sets the ctime of the inode and the mtime of the
         * directory inode to the current server time.
         *
         * This locks the inode, and acquires an unlink lock for the path.
         *
         * If the number of links for the inode drops to 0 at commit time,
         * the inode is implicitly deleted.
         */
        rvoid unlink(trans_id, longstring) = 18;

        /* list
         *
         * list(tid, inode): Lists the contents of the directory. Only a
         * single directory can be listed (no recursion). The RPC returns
         * the basenames of the contained files only (path information
         * stripped).
         *
         * The path name of the directory is existence-locked.
         */
        rentries list(trans_id, hyper) = 19;

        /* set_block_checksum
         *
         * set_block_checksum(tid, inode, index, checksum)
         *
         * NOTE(review): the description of this RPC is truncated in the
         * source. The surviving fragment reads "... set_block_checksum for
         * every written block", i.e. presumably the client is expected to
         * call set_block_checksum for every written block - confirm
         * against the original documentation.
         */
        rvoid set_block_checksum(trans_id, hyper, hyper, longstring) = 20;

        /* iterate
         *
         * iterate by inode: iterate(tid, inode, n) returns the up to n
         * smallest inodes that are larger than inode. (Privileged
         * operation.)
         */
        rhypers iterate(trans_id, hyper, int) = 21;
        /* not yet implemented */

    } = 1;
} = 0x8000e001;
#endif