/* $Id: pfs_types.x 235 2010-06-20 22:23:05Z gerd $ -*- c -*- */
/** Types for the RPC interfaces
*/
/** {b For users of the {!Plasma_client} module:} The types defined
here are mapped to corresponding Ocaml types, and exported via
the {!Plasma_rpcapi_aux} module.
*/
/** {b Within the server,} however, the mappings of {!Pfs_rpcapi_aux}
are used. (These mappings differ in some minor points from the ones
used for the client.)
*/
#ifndef PFS_TYPES_X
#define PFS_TYPES_X
/** {2 [longstring]} */
typedef string longstring<>;
/** A variable-length string without a declared maximum, i.e. its length is
only limited by the 32 bit XDR length field (up to 4G bytes)
*/
/** {2 [longstrings]} */
typedef longstring longstrings<>;
/** An array of longstrings (variable length, no declared maximum) */
/** {2 [longstring_opt]} */
typedef longstring *longstring_opt;
/** A longstring option (XDR optional data: the value is either present
or absent)
*/
/** {2 [hypers]} */
typedef hyper hypers<>;
/** An array of hypers, i.e. of 64 bit signed integers (variable length,
no declared maximum)
*/
/** {2 [trans_id]} */
typedef hyper trans_id;
/** Transaction IDs make it possible to run several transactions over the
same TCP connection
*/
/** {2 [ug]} */
struct ug {
    longstring user;     /* the user, given by name */
    longstring group;    /* the group, given by name */
};
/** Users and groups are identified by their names */
/** {2 [time] } */
struct time {
    hyper tsecs;     /* whole seconds since the epoch ... */
    int   tnsecs;    /* ... plus this many nanoseconds */
};
/** Normally, both [tsecs] and [tnsecs] must be non-negative, and
    [tnsecs < 1E9]. The one exception is the filesystem procedure
    [update_inodeinfo]: there, a negative [tsecs] is interpreted as
    "set the time to the current server time"
 */
/** {2 [time_opt]} */
typedef time *time_opt;
/** An optional [time] struct (XDR optional data: either present or absent) */
/** {2 [ftype_enum]} */
enum ftype_enum {
    FTYPE_REGULAR   = 0,    /* regular file */
    FTYPE_DIRECTORY = 1,    /* directory */
    FTYPE_SYMLINK   = 2     /* symlink */
};
/** The possible file types */
/** {2 [ftype]} */
union ftype switch (ftype_enum d) {
    case FTYPE_REGULAR:
        void;
    default:
        /* every other file type; like FTYPE_REGULAR this arm carries no data */
        void;
};
/** File types as union. No arm has a payload, only the discriminant
    is transmitted.
 */
/** {2:blockinfo [blockinfo]} */
/** [blockinfo] says where the n-th block of a file is stored on a datanode.
The number [n] is called the {i block index} (starting at 0).
The datanode location is given by the {i identity} of the datanode,
and the {i block number} of the datanode. Block numbers count from 0
to [s-1] when [s] is the number of blocks a datanode stores.
In [blockinfo] there is also the information to which machine the
identity of the datanode is assigned, and whether the machine is
alive. This is purely informational, and is intended to ease the
implementation of clients.
Checksums are not yet implemented.
The sequence number of the inode is increased whenever new data
is written. It is also included in [blockinfo] to simplify the
implementation of caches.
Safe transactions: The numbers [safetrans_id] and [safetrans_vfy]
need to be passed on to the datanode
in order to read or write the block. The numbers are only valid for
the current transaction, and only for this block. The numbers form
together a ticket that allows accesses to the block.
(Actually, the datanode
only checks for writes whether the client has a valid ticket, but
not for reads. This might change in the future, though.)
*/
struct blockinfo {
    hyper      index;            /* block index (position of the block in the file) */
    longstring node;             /* datanode server as "host:port", or "" if not known */
    longstring identity;         /* identity string of the datanode server */
    hyper      block;            /* block number on this datanode */
    bool       node_alive;       /* whether the node is alive (informational only) */
    longstring *checksum;        /* checksum; optional */
    hyper      inode_seqno;      /* current sequence number of the inode */
    hyper      safetrans_id;     /* first part of the safetrans ticket */
    hyper      safetrans_tmo;    /* timeout of the safetrans ticket */
    hyper      safetrans_vfy;    /* second part of the safetrans ticket */
};
/** {3 Safe transactions} */
/** [safetrans_*]: For securing the communication with the datanode.
The blocks are accessible for a limited period of time only.
The [safetrans_id] identifies the datanode transaction. [safetrans_tmo]
is the point in time when the access times out. After that
the data nodes will not accept writes to the blocks any longer.
The verifier [safetrans_vfy] is a hash value built from the
[safetrans_id] and the block number, and is used by the data
node to check that only accessible blocks are written:
{[safetrans_vfy=extract_64_bits(MD5(safetrans_id ^ "/" ^
safetrans_secret ^ "/" ^ block))]}
Usually, the safetrans feature is only used for securing
block writes. The protocol would also allow using it for
reads, though, and compatible clients should assume this.
*/
/** {2:blocklist [blocklist]} */
/** Block lists describe where the blocks of a file are stored.
Note that the following phenomena can occur:
- the same block index can occur several times (replicas)
- a certain block index does not occur at all (a file hole)
- all [blockinfo] structs for a block index say that the
datanode is down (a broken file)
*/
typedef blockinfo blocklist<>;
/** {2:inodeinfo [inodeinfo]} */
/** [inodeinfo] is what is stored for an inode. Documentation is
inline below. Note that [inodeinfo] structs may be passed from
the server to the client, and from the client to the server.
In the latter case, the client may not know all fields, or
may use special values in fields.
*/
struct inodeinfo {
    ftype filetype;
    /** {ul {- The type of the file. Some of the fields are only meaningful
        for certain types.}}
     */
    ug usergroup;
    /** {ul {- The owner}}
     */
    int mode;
    /** {ul {- The file permission bits}}
     */
    hyper eof;
    /** {ul {- The [eof] value is a pure convention. The server never
        changes it automatically when blocks are allocated or freed.
        Consequently, [eof] may be set to a position before the last
        block or after the last block. It is only the interpretation
        of the user that makes this number the [eof] position.
        By convention, [eof] is only meaningful for regular files.}}
     */
    time mtime;
    time ctime;
    /** {ul {- The time fields are not maintained automatically, with one
        exception: a link or unlink operation implicitly updates the
        [mtime] of the directory. See the documentation for [time] for
        how clients can request that the server fills in its own
        current time.}}
     */
    int replication;
    /** {ul {- The replication factor the file ought to have. Clients can
        set it to 0 to request the default replication factor. The
        values returned from the server are always >= 1.
        Replication is only meaningful for regular files.}}
     */
    hyper blocklimit;
    /** {ul {- The blocks from the index [blocklimit] on are not allocated.
        Clients cannot set this field - the field value is ignored.
        Note that this is totally unrelated to [eof], which can be set
        to any value independently of how many blocks are allocated.
        Also, there may be holes in the file before [blocklimit].}}
     */
    longstring field1;
    /** {ul {- For symlinks this is the target. For other types this field
        has no defined meaning.}}
     */
    hyper seqno;
    /** {ul {- The server increases this number whenever blocks are
        allocated or freed, i.e. for every content modification of the
        file. It is not possible to change this field. This means one
        can easily check for any file modification by comparing the
        inodeinfo fields, which are guaranteed to change for every data
        or metadata modification.}}
     */
    hyper create_verifier;
    /** {ul {- Intended for use as NFSv3 create verifier. Set to 0 outside
        NFS scope.}}
     */
};
/** {2 [entry]} */
/** A single entry of a directory */
struct entry {
    longstring entry_name;     /* basename of a file in a directory */
    hyper      entry_inode;    /* inode of this file */
};
/** {2 [entries]} */
typedef entry entries<>;
/** An array of directory entries (variable length, no declared maximum) */
/** {2 [fsstat]} */
struct fsstat {
    hyper total_blocks;
    hyper used_blocks;
    hyper trans_blocks;
    /** {ul {- The blocks in a transitional phase: a transaction has
        allocated them, but that transaction is not yet committed}}
     */
    bool have_block_checksums;
    /** {ul {- Whether this feature is enabled}} */
    bool have_safetrans_for_reads;
    /** {ul {- Whether safetrans ID's are required for read access}} */
    bool have_protected_inodes;
    /** {ul {- Whether [Filesystem] users may only pass inode numbers to
        RPC's that have been returned by previously called RPC's in the
        same transaction}}
     */
};
/** {2 [errno_code]} */
/* The numeric values are assigned explicitly by this protocol definition. */
enum errno_code {
    OK            = 0,
    ENOTRANS      = 1,     /* no transaction */
    EFAILEDCOMMIT = 2,     /* general commit error */
    ELONGTRANS    = 3,     /* transaction too long */
    EFAILED       = 4,     /* general error */
    EPERM         = 5,     /* not owner, or the operation is otherwise not permitted */
    ENOENT        = 6,     /* no such file or directory */
    EACCESS       = 7,     /* permission denied */
    EEXIST        = 8,     /* file exists */
    EFHIER        = 9,     /* file hierarchy violation */
    EINVAL        = 10,    /* invalid argument */
    EFBIG         = 11,    /* file too big */
    ENOSPC        = 12,    /* no space left */
    EROFS         = 13,    /* read-only filesystem */
    ENAMETOOLONG  = 14,    /* filename too long */
    ECONFLICT     = 15,    /* the update conflicts with another transaction */
    ECOORD        = 16,    /* this is not the coordinator */
    ENONODE       = 17,    /* unknown node */
    ETBUSY        = 18,    /* transaction is busy (last command not finished) */
    ENOIENT       = 19,    /* no such inode */
    EIO           = 20     /* datanode error, not enough datanodes */
};
/** {2 Result types of RPC's} */
/** This macro is used for creating the result types of [Filesystem]
RPC's. These results are always unions discriminated by an [errno_code].
For the special value [OK], a value of some type is returned as result
value; for every other code (i.e. for all errors) nothing is returned.
The first parameter is the name of the union to define. The second
parameter is the complete declaration of the [OK] arm, i.e. either a
type followed by a component name, or just [void].
*/
#define MK_RESULT_TYPE(name,type) \
union name switch(errno_code d) { \
case OK: \
type; \
default: \
void; \
}
/** Creates the types:
- [rvoid] (no result value)
- [rinodeinfo] (result is an [inodeinfo])
- [rblocklist] (result is a [blocklist])
- [rfsstat] (result is an [fsstat])
- [rint] (result is an [int])
- [rhyper] (result is a [hyper])
- [rhypers] (result is a [hypers] array)
- [rlongstrings] (result is a [longstrings] array)
- [rentries] (result is an [entries] array)
*/
MK_RESULT_TYPE(rvoid,void);
MK_RESULT_TYPE(rinodeinfo,inodeinfo t);
MK_RESULT_TYPE(rblocklist,blocklist t);
MK_RESULT_TYPE(rfsstat,fsstat t);
MK_RESULT_TYPE(rint,int t);
MK_RESULT_TYPE(rhyper,hyper t);
MK_RESULT_TYPE(rhypers,hypers t);
MK_RESULT_TYPE(rlongstrings,longstrings t);
MK_RESULT_TYPE(rentries,entries t);
/* Revision numbers have the format:
YYYYMMDDHHMMSSUUUUUU:<random hex digits>
It is meaningful to sort revision numbers.
*/
#ifdef SERVER_CONTEXT
/** {2 [readdata] in server context} */
typedef string readdata<>;
/** {2 [writedata] in server context} */
typedef _managed string writedata<>;
/** (Ocamlnet's language mapping layer uses a different representation
    for managed strings.)
 */
/** {2 [announcement] in server context} */
struct announcement {
    longstring  ann_clustername;    /* clustername */
    longstring  ann_sender;         /* sender, as host:port */
    longstrings ann_eligible;       /* list of hosts that are eligible (host:port syntax) */
    longstring  ann_revision;       /* the revision number of the sender */
    longstring  ann_rank;           /* configured rank */
};
#else
/** {2 [readdata] in client context} */
typedef _managed string readdata<>;
/** {2 [writedata] in client context} */
typedef _managed string writedata<>;
#endif
/** {2 [dn_channel_enum] } */
/** Block data can be exchanged with the datanode servers in two ways:
    + The block data is included in the normal RPC call as string
    + The block data is put into a shared memory object

    Of course, the second method only works if the client and the server
    are on the same node. Also, the client needs to invoke the server RPC
    via a Unix Domain socket, and not via TCP.
    More methods may be defined in the future.
 */
enum dn_channel_enum {
    DNCH_RPC = 0,    /* the data is embedded into the RPC channel */
    DNCH_SHM = 1     /* the data is exchanged via a POSIX shm object */
};
/** {2:dn_channel_shm_obj [dn_channel_shm_obj] } */
/** Identifies a shared memory object */
struct dn_channel_shm_obj {
    longstring shm_path;      /* must be a path for POSIX shm */
    hyper      shm_offset;    /* the offset to the start in the file */
    int        shm_length;    /* the length of the object */
};
/** {2 [dn_channel_rd_req] } */
/** The argument for data reads. The client can request the data
    exchange method with it. For shared memory, the client also has to
    say which shared memory object will receive the data.
 */
union dn_channel_rd_req switch (dn_channel_enum d) {
    case DNCH_RPC:
        void;
    case DNCH_SHM:
        dn_channel_shm_obj ch;
};
/** {2 [dn_channel_rd_data] } */
/** The return value of the data server for a read request.
    If the data is included in the RPC message, it follows here.
    If the data is put into shared memory, the client can now expect it
    to be there.
 */
union dn_channel_rd_data switch (dn_channel_enum d) {
    case DNCH_RPC:
        readdata data;
    case DNCH_SHM:
        void;
};
/** {2 [dn_channel_wr_data] } */
/** For write requests, the client either includes the data directly
    in the message, or it has already put the data into a shared memory
    object, and only includes the information where it can be found
 */
union dn_channel_wr_data switch (dn_channel_enum d) {
    case DNCH_RPC:
        writedata data;
    case DNCH_SHM:
        dn_channel_shm_obj ch;
};
#endif