/* $Id: pfs_types.x 274 2010-10-24 15:04:09Z gerd $ -*- c -*- */Types for the RPC interfaces
For users of the Plasma_client
module: The types defined
here are mapped to corresponding Ocaml types, and exported via
the Plasma_rpcapi_aux
module.
Within the server, however, the mappings of Pfs_rpcapi_aux
are used. (These mappings differ in some minor points from the ones
used for the client.)
/* Include guard: the definitions below are only processed while
   PFS_TYPES_X is still undefined. */
#ifndef PFS_TYPES_X
#define PFS_TYPES_X
longstring
/* A string up to 4G length */
typedef string longstring<>;
longstrings
/* An array of longstrings */
typedef longstring longstrings<>;
longstring_opt
/* A longstring option */
typedef longstring *longstring_opt;
hypers
/* An array of hypers */
typedef hyper hypers<>;
trans_id
/* Transaction IDs can be used to run several transactions over the
   same TCP connection */
typedef hyper trans_id;
ug
/* Users and groups are given by name */
struct ug {
    longstring user;
    longstring group;
};
time
/* A point in time, split into whole seconds and a nanosecond part. */
struct time {
    hyper tsecs;     /* Seconds since the epoch... */
    int   tnsecs;    /* plus these nanoseconds */
};
tsecs and tnsecs must be non-negative, and tnsecs < 1E9.
The only exception: in the filesystem procedure update_inodeinfo
a negative tsecs is interpreted as "set the time to the current
server time".
time_opt
/* An optional time struct */
typedef time *time_opt;
ftype_enum
/* File types */
enum ftype_enum {
    FTYPE_REGULAR   = 0,
    FTYPE_DIRECTORY = 1,
    FTYPE_SYMLINK   = 2
};
ftype
/* File types as union. No arm carries a payload: both the
   FTYPE_REGULAR arm and the default arm are void. */
union ftype switch (ftype_enum d) {
case FTYPE_REGULAR:
    void;
default:
    void;
};
blockinfo
blockinfo
says where the n-th block of a file is stored on a datanode.
The number n
is called the block index (starting at 0).
The datanode location is given by the identity of the datanode,
and the block number on the datanode. Block numbers count from 0
to s-1, where s is the number of blocks a datanode stores.
In blockinfo
there is also the information to which machine the
identity of the datanode is assigned, and whether the machine is
alive. This is purely informational, and is intended to ease the
implementation of clients.
Checksums are not yet implemented.
The sequence number of the inode is increased whenever new data
is written. It is also included in blockinfo
to simplify the
implementation of caches.
Safe transactions: The numbers safetrans_id and safetrans_vfy
need to be passed on to the datanode in order to read or write
the block. The numbers are only valid for the current transaction,
and only for this block. Together, the numbers form a ticket that
allows access to the block.
(Actually, the datanode
only checks for writes whether the client has a valid ticket, but
not for reads. This might change in the future, though.)
/* Storage location of one block of a file, plus the safetrans ticket
   needed to access it. */
struct blockinfo {
    hyper       index;            /* block index */
    longstring  node;             /* datanode server as "host:port" ("" if not known) */
    longstring  identity;         /* datanode server as identity string */
    hyper       block;            /* block number on this node */
    bool        node_alive;       /* informational: whether the node is alive */
    longstring *checksum;         /* optional checksum */
    hyper       inode_seqno;      /* current seqno of the inode */
    bool        inode_committed;  /* whether [inode_seqno] is a committed version */
    hyper       safetrans_id;     /* safetrans ticket, first part */
    hyper       safetrans_tmo;    /* safetrans ticket, timeout */
    hyper       safetrans_vfy;    /* safetrans ticket, second part */
};
safetrans_*
: For securing the communication with the datanode.
The blocks are accessible for a limited period of time only.
The safetrans_id
identifies the datanode transaction. safetrans_tmo
is the point in time when the access times out. After that
the data nodes will not accept writes to the blocks any longer.
The verifier safetrans_vfy
is a hash value built from the
safetrans_id
and the block number, and is used by the data
node to check that only accessible blocks are written:
safetrans_vfy=extract_64_bits(MD5(safetrans_id ^ "/" ^
safetrans_secret ^ "/" ^ block))
Usually, the safetrans feature is only used for securing block writes. The protocol would also allow using it for reads, though, and compatible clients should assume this.
blocklist
A blocklist may contain blockinfo structs for a block index that
say that the datanode is down (a broken file).
Within a blocklist, the inode_seqno and inode_committed fields
all have the same values. This is not broken down per block (it
would be possible for these values to "remember" the sequence
number when the block was first committed, resulting in finer
granularity of the information).
/* A variable-length array of blockinfo structs. */
typedef blockinfo blocklist<>;
inodeinfo
inodeinfo
is what is stored for an inode. Documentation is
inline below. Note that inodeinfo
structs may be passed from
the server to the client, and from the client to the server.
In the latter case, the client may not know all fields, or
may use special values in fields.
/* NOTE(review): several field remarks below appear to have lost their
   opening words in this copy. Reconstructed openings are marked
   "presumably" and should be confirmed against the original source. */
struct inodeinfo {
    ftype filetype;
    ug usergroup;
    int mode;

    /* The eof value is seen as a convention only. The server never
       automatically changes it when blocks are allocated or freed.
       This means eof can be set to a position before the last
       block or after the last block. It is just the interpretation
       of the user to use this number as eof position.
       Conventionally, eof is only meaningful for regular files. */
    hyper eof;

    /* NOTE(review): the beginning of this remark is missing; the
       surviving text refers to the "mtime of the directory".
       See the documentation for time how clients can request that
       the server fills in its own current time. */
    time mtime;
    time ctime;

    /* Replication is only meaningful for regular files. */
    int replication;

    /* Presumably: the blocks from index blocklimit on are not
       allocated (opening words missing - confirm). This field cannot
       be set by clients - the field value is ignored.
       Note that this is totally unrelated to eof which can be set to
       any value independent on how many blocks are allocated.
       Also, there may be holes in the file before blocklimit. */
    hyper blocklimit;

    longstring field1;

    /* seqno values are only valid within the transaction.
       It is generally possible that seqno is set to values that were
       already generated for previous aborted transactions.
       The seqno makes it possible to easily check for any file
       modification. This field is guaranteed to change for every
       data or metadata modification. */
    hyper seqno;

    /* Presumably: whether the inodeinfo struct is committed data
       (opening words missing - confirm). If false, the struct has
       been modified by the transaction. This flag gives valuable
       information for deciding whether the struct can be cached or
       not. This field is automatically maintained and cannot be set
       directly. */
    bool committed;

    hyper create_verifier;
};
entry
/* One directory entry: a name together with the inode it refers to. */
struct entry {
    longstring entry_name;    /* basename of a file in a directory */
    hyper      entry_inode;   /* inode of this file */
};
entries
/* A variable-length array of entry structs. */
typedef entry entries<>;
fsstat
struct fsstat {
    hyper total_blocks;
    hyper used_blocks;
    hyper trans_blocks;
    bool have_block_checksums;
    bool have_safetrans_for_reads;

    /* NOTE(review): the beginning of this remark is missing in this
       copy; presumably it describes the case that the flag is set.
       Surviving text: Filesystem users may only pass inode numbers to
       RPC's that have been returned by previously called RPC's in the
       same transaction */
    bool have_protected_inodes;
};
errno_code
/* Status codes used as the discriminant of RPC result unions;
   OK (0) is the only non-error value. */
enum errno_code {
    OK            = 0,
    ENOTRANS      = 1,   /* no transaction */
    EFAILEDCOMMIT = 2,   /* general commit error */
    ELONGTRANS    = 3,   /* transaction too long */
    EFAILED       = 4,   /* general error */
    EPERM         = 5,   /* not owner, or op is otherwise not permitted */
    ENOENT        = 6,   /* No such file or directory */
    EACCESS       = 7,   /* Permission denied */
    EEXIST        = 8,   /* File exists */
    EFHIER        = 9,   /* File hierarchy violation (e.g. move a directory into its own subdirectory) */
    EINVAL        = 10,  /* invalid argument */
    EFBIG         = 11,  /* file too big */
    ENOSPC        = 12,  /* no space left */
    EROFS         = 13,  /* read-only filesystem */
    ENAMETOOLONG  = 14,  /* filename too long */
    ECONFLICT     = 15,  /* update conflicts with another transaction */
    ECOORD        = 16,  /* this is not the coordinator */
    ENONODE       = 17,  /* unknown node */
    ETBUSY        = 18,  /* transaction is busy (last command not finished) */
    ESTALE        = 19,  /* no such inode */
    EIO           = 20,  /* datanode error, not enough datanodes */
    ELOOP         = 21,  /* looping symlinks */
    ENOTDIR       = 22,  /* operation can only be done for directory */
    EISDIR        = 23,  /* operation can only be done for non-directory */
    ENOTEMPTY     = 24,  /* directory is non-empty but need to be */
    EBADPATH      = 25   /* a path component is not a directory (POSIX sees this also as ENOTDIR) */
};
Result types of the Filesystem RPC's. These results are always
unions of the possible error codes with the special value OK.
For OK, a value of some type is returned as result value, and
this type is the second parameter of the macro.
/* MK_RESULT_TYPE(name,type) expands to a union called [name] that is
   discriminated by an errno_code: the OK arm carries [type], every
   other code carries void. Each continuation backslash must be the
   last character on its line. */
#define MK_RESULT_TYPE(name,type) \
  union name switch(errno_code d) { \
  case OK: \
    type; \
  default: \
    void; \
  }
/* Creates the types: */
rvoid
rinodeinfo
rblocklist
rfsstat
rint
rhyper
rhypers
rlongstring
rlongstrings
rentries
/* One result union per payload type returned by the RPC's. */
MK_RESULT_TYPE(rvoid,void);
MK_RESULT_TYPE(rinodeinfo,inodeinfo t);
MK_RESULT_TYPE(rblocklist,blocklist t);
MK_RESULT_TYPE(rfsstat,fsstat t);
MK_RESULT_TYPE(rint,int t);
MK_RESULT_TYPE(rhyper,hyper t);
MK_RESULT_TYPE(rhypers,hypers t);
MK_RESULT_TYPE(rlongstring,longstring t);
MK_RESULT_TYPE(rlongstrings,longstrings t);
MK_RESULT_TYPE(rentries,entries t);

/* Revision numbers have the format:
   YYYYMMDDHHMMSSUUUUUU:<random hex digits>

   It is meaningful to sort revision numbers.
*/

/* A preprocessor directive has to start its own line. */
#ifdef SERVER_CONTEXT
readdata
in server context
/* readdata as seen when SERVER_CONTEXT is defined: a plain
   variable-length string. */
typedef string readdata<>;
writedata
in server context
/* A managed string is represented differently in Ocamlnet's language
   mapping layer. */
typedef _managed string writedata<>;
announcement
in server context
/* Announcement record; only declared when SERVER_CONTEXT is defined. */
struct announcement {
    longstring  ann_clustername;  /* clustername */
    longstring  ann_sender;       /* sender host:port */
    longstrings ann_eligible;     /* list of hosts that are eligible (host:port syntax) */
    longstring  ann_revision;     /* the revision number of the sender */
    longstring  ann_rank;         /* configured rank */
};

/* A preprocessor directive has to start its own line. */
#else
readdata
in client context
/* readdata as seen when SERVER_CONTEXT is not defined: a managed
   variable-length string. */
typedef _managed string readdata<>;
writedata
in client context
/* writedata as seen when SERVER_CONTEXT is not defined: a managed
   variable-length string. */
typedef _managed string writedata<>;

#endif /* SERVER_CONTEXT */
dn_channel_enum
More methods of exchanging the data may be defined in the future.
/* Selects how block data is exchanged. */
enum dn_channel_enum {
    DNCH_RPC = 0,    /* the data is embedded into the RPC channel */
    DNCH_SHM = 1     /* the data is exchanged via a POSIX shm object */
};
dn_channel_shm_obj
/* A region inside a POSIX shm object, given by path, offset and length. */
struct dn_channel_shm_obj {
    longstring shm_path;      /* must be a path for POSIX shm */
    hyper      shm_offset;    /* the offset to the start in the file. */
    int        shm_length;    /* the length of the object */
};
dn_channel_rd_req
/* Read request: the DNCH_RPC arm is void, the DNCH_SHM arm names
   a shm object. */
union dn_channel_rd_req switch (dn_channel_enum d) {
case DNCH_RPC:
    void;
case DNCH_SHM:
    dn_channel_shm_obj ch;
};
dn_channel_rd_data
/* Read result: the DNCH_RPC arm carries the data inline, the DNCH_SHM
   arm is void. */
union dn_channel_rd_data switch (dn_channel_enum d) {
case DNCH_RPC:
    readdata data;
case DNCH_SHM:
    void;
};
dn_channel_wr_data
/* Write payload: the DNCH_RPC arm carries the data inline, the
   DNCH_SHM arm names a shm object. */
union dn_channel_wr_data switch (dn_channel_enum d) {
case DNCH_RPC:
    writedata data;
case DNCH_SHM:
    dn_channel_shm_obj ch;
};

#endif /* PFS_TYPES_X */