Plasma GitLab Archive
Projects Blog Knowledge

/* $Id: pfs_nn_fsys.x 536 2011-12-11 22:59:10Z gerd $ -*- c -*- */

/** {1:filesystem [Filesystem]} */

/** Filesystem access */

#include "pfs_types.x"

#ifndef PFS_NN_FSYS
#define PFS_NN_FSYS

/* RPC program [Filesystem], program number 0x8000e001, with a single
   version V1 (version number 1). The procedure numbers below are part
   of the wire protocol and must not be renumbered. Note that they do not
   follow textual order everywhere: [get_params] (35) and [get_dn_info]
   (36) are declared between procedures 13 and 14.
*/
program Filesystem {
    version V1 {

	/** {2 [null]} */

	void null(void) = 0;
	/** [null()]: Takes no arguments and returns nothing. */

	/** {2 Transactions} */

	/** Many procedures are run inside a transaction. A transaction
	    is identified by a [trans_id] which can be freely chosen
	    by the client. A client may open several transactions
	    simultaneously.

	    When the TCP connection is closed, all open transactions are
	    implicitly aborted.

	    Transacted operations must not overlap! This means that once
	    an op has been started, the next op of the same transaction
	    must not be called before the previous one has sent its
	    result back.

	    Transaction isolation: Only committed changes are visible
	    from other transactions ("read committed"). For block
	    lists, there is an even stronger guarantee. Once a block is
	    returned to the client in a transaction, a competing
	    delete request for this block is not immediately visible,
	    but delayed until the transaction finishes that expects
	    the block to exist. This is equivalent to a "repeatable
	    read" isolation level.

	    Conflicting changes of the same piece of data or metadata
	    can be resolved in various ways:

	    {ul
	    {- If the inode is directly modified (via [update_inodeinfo]),
	      or if blocks are allocated or freed,
	      the inode is locked for the rest of the transaction, so that
	      no other transaction can change the inode in parallel.
	      The other transaction will get an [ECONFLICT] error.}
	    {- The inode can also be indirectly modified, e.g. mtime
	      updates because data is written. These modifications do
	      not lock the inode. In these cases, the last commit wins,
	      and overwrites the changes of previous commits.}
	    {- Directories have a different locking system. A file name
	      can be locked in three different ways: An {i existence lock}
	      ensures that no other transaction can delete it. For example,
	      this kind of lock is acquired for the path of a directory
	      before a file is created in that directory. A {i creation
	      lock} is acquired for files that are created exclusively.
	      An {i unlink lock} is acquired for files that are going
	      to be deleted.}
            }

            Read accesses usually do not acquire locks. However, there
	    are some exceptions, and these are explained for each RPC.

	    Inode numbers are unique.
	*/

	/** {2:begin_transaction [begin_transaction] } */

	rvoid begin_transaction(trans_id) = 1;

	/** [begin_transaction(tid)]: Starts a new transaction. */

	/** {2:commit_transaction [commit_transaction]} */

	rvoid commit_transaction(trans_id) = 2;

	/** [commit_transaction(tid)]: Commits a transaction and makes
	    its effects permanent. At
	    commit time, there cannot be any logical inconsistencies
	    that would prevent it. However, a commit may fail when
	    data cannot be physically written out.
	*/

	/** {2:abort_transaction [abort_transaction]} */

	rvoid abort_transaction(trans_id) = 3;

	/** [abort_transaction(tid)]: Aborts the transaction. */


	/** {2 Inodes} */


	/** {2:get_inodeinfo [get_inodeinfo]} */

	rinodeinfo get_inodeinfo(trans_id, hyper) = 4;
	/** [get_inodeinfo(tid, inode)]: Returns the [inodeinfo] struct
	    for the inode with the ID [inode].
	*/

	/** {2:allocate_inode [allocate_inode]} */

	rhyper allocate_inode(trans_id, inodeinfo) = 5;
	/** [allocate_inode(tid, ii)]: Creates a new inode and initializes
	    the [inodeinfo] struct to [ii]. The inode is locked.

	    Note that, at commit time at the latest, an inode must be
	    associated with at least one file name. Otherwise it is
	    implicitly deleted.

	    The [seqno] field of [ii] must be set to 0 (else [EINVAL]).
	    The file is created with a [seqno] field of 1.
	*/

	/** {2:update_inodeinfo [update_inodeinfo]} */

	rvoid update_inodeinfo(trans_id, hyper, inodeinfo) = 6;
	/** [update_inodeinfo(tid, inode, ii)]: Updates the [inodeinfo]
	   struct of [inode] to [ii]. The inode is locked for that.

           Only updates of these fields are possible:
           [usergroup], [mode], [eof], [mtime], [ctime], [replication],
	   [field1], [create_verifier]. An update of [replication]
	   only changes the required replication, but not the actual
	   replication.

	   The [seqno] field cannot be modified.
	*/

	/** {2:delete_inode [delete_inode]} */

	rvoid delete_inode(trans_id, hyper) = 7;
	/** [delete_inode(tid, inode)]: Deletes the inode. The inode
	    is locked for that.
	*/

	/** {2 Block lists} */

	/** {2:get_blocks [get_blocks]} */

	rblocklist get_blocks(trans_id, hyper, hyper, hyper, hyper, bool) = 8;
	/** [get_blocks(tid, inode, index, len, seqno, pin)]:
	    Get information about blocks
           [index] to [index+len-1] of the [inode]. There may be several
           blockinfo structs for an index if the block is replicated.

	   If the [pin] flag is set, the returned blocks contain
	   the datanode tickets needed to actually retrieve the blocks,
	   and the blocks are guaranteed to
	   exist even if a competing transaction requests their deletion,
	   and even if the delete is committed.  This guarantee is
	   valid for the duration of the transaction [tid]. This means
	   that deletes may be delayed for the time there are still
	   transactions expecting the blocks to exist. Without [pin]
	   the returned block list may already be outdated when the
	   caller receives it. Consequently, it is not allowed to
	   retrieve the blocks in this case. This type of call can
	   still be useful
	   to get statistical information about the physical location of
	   the blocks.
	   Implementing [pin] has some cost, so it is allowed to turn
	   this feature off.

	   As data blocks are immutable, [get_blocks] has the effect
	   of taking a snapshot of the requested file range. By requesting
	   all blocks ([len = 0xffff_ffff_ffff_ffff]), it is even possible
	   to create a snapshot view of the whole file. The snapshot
	   persists for the duration of the transaction.

	   By passing [seqno > 0], the procedure is only successful if
	   the file still has this sequence number. If not, the error
	   code [ECONFLICT] is returned.
	*/

	/** {2:allocate_blocks [allocate_blocks]} */

	rblocklist allocate_blocks(trans_id, hyper, hyper, hyper, bool,
				   longstrings) = 9;
	/** [allocate_blocks(tid, inode, index, len, set_mtime, data_pref)]:
           Allocate new blocks
           for the range [index] to [index+len-1]. Old blocks in this
           range are freed. Blocks are allocated on various nodes
           respecting the replication policy. The inode is locked.

	   [set_mtime]: Whether to set [mtime] to the server time.

	   [data_pref]: These datanodes are preferred for storing the
	   blocks. The nodes must be given as identity names.
	   This list is only a suggestion. For every block, an attempt
	   is made to allocate it on one of the preferred nodes, even if
	   the allocation becomes unbalanced. However, if it is not
	   possible to follow the suggestion it is ignored. If a node name
	   cannot be identified, the element of [data_pref] is silently
	   ignored. This parameter is mostly useful to make it highly
	   likely that blocks are stored locally - on the same machine
	   as the machine running the requesting client.

           It is only allowed to allocate blocks for regular files.

	   Note that the last possible block has the number
	   0x7fff_ffff_ffff_fffe corresponding to a maximum file length
	   of 0x7fff_ffff_ffff_ffff blocks.
	*/

	/** {2:free_blocks [free_blocks]} */

	rvoid free_blocks(trans_id, hyper, hyper, hyper, bool) = 10;
	/** [free_blocks(tid, inodenr, index, len, set_mtime)]: Frees the blocks
           [index] to [index+len-1]. It is not an error to free a block
           that was not allocated. The inode is locked.

	   [set_mtime]: Whether to set [mtime] to the server time.
	*/

	/** {2 statistics} */

	/** {2 [get_fsstat]} */

	rfsstat get_fsstat(void) = 11;
	/* [get_fsstat()]: Takes no arguments; in particular, no [trans_id]
	   is passed.
	   NOTE(review): presumably returns filesystem-wide statistics -
	   confirm against the definition of [rfsstat], which is not part
	   of this file.
	*/

	/** {2 replication control} */

	/** {2 [rereplicate]} */

	rvoid rereplicate(trans_id, hyper) = 12;
	/* [rereplicate(tid, inode)]: Fixes the replication */
	/* not yet implemented */

	/** {2 Misc} */

	/** {2 [get_blocksize]} */

	int get_blocksize(void) = 13;
	/** [get_blocksize()]: Returns the blocksize. */

	/** {2 [get_params]} */

	params get_params(void) = 35;
	/** [get_params()]: Returns some parameters as (name,value) tuples:
	    - [clustername]: the name of the cluster
	    - [coordinator]: the host name of the coordinator
	    - [blocksize]: the value is the blocksize as decimal number
	    - [lock_timeout]: for how long transactions should be repeated
	      after [ECONFLICT]
	    - [replication]: the default replication factor
	    - [data_security_level]: how to access the datanodes:
	      "none", "auth", "int", "priv"
	    - [data_timeout]: the timeout in seconds for accessing the data
	      nodes (integer decimal number)
	*/

	/** {2 [get_dn_info]} */

	dn_info_list get_dn_info(void) = 36;
	/** [get_dn_info()]: Returns a list of records describing the
	    datanodes with
	    - identities
	    - hosts and ports
	    - size
	*/

        /** {2 Directories and filenames} */

	/** An existing inode can be connected with a filename. Filenames
           are as in Unix (slash-separated). All filenames must start with
           a slash.

           When a filename is created, the parent directory must already
           exist (in the [link] operation). For regular files and symlinks
           it is allowed that the inode is connected to several filenames.

           With [unlink] the filename is deleted. Unlike in [Unix] the
           last [unlink] operation for an inode does not delete the inode
           automatically. The delete is delayed until the transaction
	   is committed. (So a file can be renamed by first unlinking the
	   old name, and then linking the new name.)

           For directories, it is required that the directory
           is empty before [unlink]. It is not possible to delete "/".

	   There are three kinds of locks for filenames:
	   {ul
	   {- {i Existence locks}: This means that the filename must not
	     be deleted by a competing transaction. Existence locks are
	     non-exclusive, i.e. several transactions can hold them for the
	     same name.}
	   {- {i Unlink locks}: This is the counterpart - this lock
	     means that the filename is going to be deleted. Unlink
	     locks are exclusive.}
	   {- {i Creation locks}: This type of lock means that a filename
	     is being created in an exclusive way. This lock is also
	     exclusive.}
           }

	   The locks are not granted for the whole path under which a
	   file is known, but only for the last component of the path
	   relative to the containing directory. The directory can be
	   moved.

	   For example, assume there is a directory [/dir]. Transaction 1
	   creates a file in this directory [/dir/file]. While the
	   transaction is open, an existence lock on [/dir] and a
	   creation lock on [/dir/file] are held. A competing transaction 2
	   tries to delete the directory [/dir]. Of course, both transactions
	   cannot be committed together - they are logically inconsistent.
	   What actually happens depends on the order of the operations:
	   If the file is created first, transaction 1 gets all its locks,
	   and transaction 2 fails when trying to get an unlink lock
	   for [/dir]. If the deletion occurs first, transaction 2 gets
	   the unlink lock on [/dir], and transaction 1 fails to acquire
	   the existence lock on [/dir].
	*/

	/** {2:lookup [lookup]} */

	rhyper lookup(trans_id, hyper, longstring, bool) = 14;
	/** [lookup(tid, dir_inode, path, symbolic)]: This RPC is used to look up
	    file names and path names. Possible cases:

	    {ul
	    {- [path] is an absolute path, and [dir_inode=(-1)]: This looks
	      up the path and returns the inode.}
	    {- [path] is a relative path, and [dir_inode] is a real
	      inode of a directory: This walks down the path starting
	      at [dir_inode].}
            }

	   The lookup resolves symbolic links. If the [symbolic] flag is
	   true, the last component of [path] is excluded from symbolic
	   link resolution.

	    This RPC does not acquire locks.
	 */

	/** {2 [rev_lookup]} */

	rlongstrings rev_lookup(trans_id, hyper) = 15;
	/** [rev_lookup(tid, inode)]:

	    Reverse lookup for this inode: Returns the connected filenames.

	    This RPC does not acquire locks.
	*/

	/** {2 [rev_lookup_dir]} */

	rlongstring rev_lookup_dir(trans_id, hyper) = 16;
	/** [rev_lookup_dir(tid, inode)]:

	    Reverse lookup for this inode: If the inode is a directory,
            returns the absolute name of the directory. Otherwise
	    an error is returned ([ENOENT] if the inode does not exist,
	    or [EFHIER] if the inode is not a directory).

	    The returned name is existence-locked.
	*/

	/** {2 [link_count]} */

	rint link_count(trans_id, hyper) = 17;
	/** [link_count(tid, inode)]:

	    Returns the number of filenames linked with this inode.
	    Note that this number is not what Unix puts into the [nlink]
	    field of a stat, because PlasmaFS does not create links for
	    "." and "..".

	    This RPC does not acquire locks.
	*/

	/** {2:link [link]} */

        rvoid link(trans_id, longstring, hyper) = 18;
	/** [link(tid, path, inode)]:
           Creates this (absolute) filename, and links it with this inode.

	   This implicitly sets the ctime of the inode and the mtime
	   of the directory inode to the current server time.

	   An existence lock on the directory containing the new file,
	   and a creation lock for the file are obtained. Note that
	   it is required that the directory is also linked in the
	   file hierarchy (i.e. you cannot put new links into directories
	   without name).
	 */

	rvoid link_at(trans_id, hyper, longstring, hyper) = 19;
	/** [link_at(tid, dir_inode, name, inode)]:
	    Creates a new [name] in the directory referenced by [dir_inode].
	    The name is connected with [inode].
	*/

	/** {2:unlink [unlink]} */

        rvoid unlink(trans_id, longstring) = 20;
	/** [unlink(tid, path)]: Deletes this filename.

	   This implicitly sets the ctime of the inode and the mtime
	   of the directory inode to the current server time.

	   This locks the inode, and acquires an unlink lock for the
	   path.

	   If the number of links for the inode drops to 0 at commit
	   time, the inode is implicitly deleted.
	*/

	rvoid unlink_at(trans_id, hyper, longstring) = 21;
	/** [unlink_at(tid, dir_inode, name)]: Deletes the member [name]
	    from the directory referenced by [dir_inode].
	*/

	/** {2:list [list]} */

	rentries list(trans_id, hyper) = 22;
	/** [list(tid, inode)]:
	    Lists the contents of the directory. Only a single directory
            can be listed (no recursion). The RPC returns the basenames
            of the contained files only (path information stripped).

	   The path name of the directory is existence-locked.
	*/

	/** {2:rename [rename]} */

	rvoid rename(trans_id, longstring, longstring) = 23;
	/** [rename(tid, old_path, new_path)]:
	    Renames [old_path] into [new_path]. It is required that
	    [new_path] does not yet exist.
	*/

	rvoid rename_at(trans_id, hyper, longstring, hyper, longstring) = 24;
	/** [rename_at(tid, old_dir_inode, old_name, new_dir_inode, new_name)]:
	    Renames the path identified by [old_name] in [old_dir_inode]
	    into the path identified by [new_name] in [new_dir_inode].
	*/

	/** {2:namelock [namelock]} */

	rvoid namelock(trans_id, hyper, longstring) = 25;
	/** [namelock(tid, dir_inode, name)]: Acquires an existence lock
	    on the member [name] of the directory referenced by [dir_inode].
	    This means that a competing transaction cannot delete this name,
	    or rename it. The protection is valid until the end of the
	    transaction.

	    Note that it is also not possible to delete the containing
	    directory, i.e. [dir_inode], because only empty directories
	    can be deleted. However, this directory can be renamed and
	    moved away. So this lock does not protect against changes of
	    the path under which [dir_inode] is known.

	    Fails with [ECONFLICT] if the lock cannot be acquired.

	    It is required that the directory exists, and that there is
	    a member [name]. Otherwise [ENOENT] is returned.
	*/

	/** {2 [set_block_checksum]} */

	/** The checksums are not automatically set.
           The client has to call [set_block_checksum] for every written
	   block.
	*/

	rvoid set_block_checksum(trans_id, hyper, hyper, longstring) = 26;
	/** [set_block_checksum(tid, inode, index, checksum)] */

	/** {2 [iterate]} */

	rhypers iterate(trans_id, hyper, int) = 27;
	/** Iterate by inode: [iterate(tid, inode, n)] returns up to [n] of
           the smallest inodes that are larger than [inode].
           (Privileged operation.)
	*/
	/* not yet implemented */

	/** {2 Authentication and authorization} */

	/** {2:impersonate [impersonate]} */

	bool impersonate(longstring, longstring, longstrings, longstring_opt, bool) = 28;
	/** [impersonate(user,group,supp_groups,auth_ticket_opt,delete)]

	    Become a different user. By default, the file operations are done
	    as the user ID that was used to authenticate on the RPC level
	    (or "pnobody" if anonymous access is permitted). This operation
	    changes this for the lifetime of the TCP connection. This is only
	    allowed if there is currently no transaction.

	    Unless the current user ID is "proot" or already [user] on
	    the RPC level, one needs an authentication ticket to prove
	    that the operation is permitted. The ticket is automatically
	    deleted when [delete] is true (saves memory).

	    Returns [true] on success, and [false] on error.
	 */

	/** {2:get_auth_ticket [get_auth_ticket]} */

	longstring_opt get_auth_ticket(longstring) = 29;
	/** [get_auth_ticket(user)]

	    Generates a new authentication ticket for [user]. The ticket
	    can be passed to [impersonate] to restore the rights of [user]
	    even if one is not logged in as this user on the RPC level.
	    The ticket has a limited lifetime only.

	    Only [user] or "proot" is allowed to create such a ticket for
	    [user].

	    Returns NULL if the operation is not permitted.
	 */

	/** {2:renew_auth_ticket [renew_auth_ticket]} */

	void renew_auth_ticket(longstring) = 30;
	/** [renew_auth_ticket(ticket)]

	    Extends the lifetime of the passed ticket. Note that there is
	    intentionally no return value.
	 */

	/** {2:read_admin_table [read_admin_table]} */

	rlongstring read_admin_table(longstring) = 31;
	/** [read_admin_table(key)]

	    Returns the contents of the admin table [key] as a single string
	    (which is actually a line-structured text file).

	    Currently defined tables: "passwd", and "group".
	*/

	/** {2:write_admin_table [write_admin_table]} */

	rvoid write_admin_table(longstring,longstring) = 32;
	/** [write_admin_table(key,contents)]

	    Sets the contents of the admin table [key] as a single string
	    (which is actually a line-structured text file).

	    Currently defined tables: "passwd", and "group". Only the user
	    "proot" can modify these tables.

	    This operation is not transactional.
	*/


	/** {2:inodecache The inode cache} */

	/* The inodecache can quickly determine whether an [inodeinfo]
	   is still up to date, or whether the sequence number of the
	   [inodeinfo] is still up to date. This is faster than a regular
	   [get_inodeinfo] because this can happen
	   outside a transaction, and because often no database query is
	   required.
	*/

	/** {2 [is_up_to_date] } */

	bool is_up_to_date(hyper, inodeinfo) = 33;
	/** [is_up_to_date(inode,ii)]: Checks whether [ii] is the current
	   version of the inode metadata for [inode]. Returns [true]
	   if this was the case at the moment the RPC was sent by
	   the caller.

	   Returns [false] if the inode is not known, if an error
	   occurs, or if it cannot be quickly determined that the inode
	   is actually up to date. So [false] does not necessarily imply
	   that [ii] is out of date. In this case, the client should
	   use alternate means of checking this.
	*/

	/** {2:is_up_to_date_seqno [is_up_to_date_seqno] } */

	bool is_up_to_date_seqno(hyper, hyper) = 34;
	/** [is_up_to_date_seqno(inode,seqno)]: Same check as
	   [is_up_to_date], but only for the sequence number of the inode.
	*/



    } = 1;
} = 0x8000e001;

#endif

/** */

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml