/* $Id: pfs_types.x 488 2011-10-25 21:29:23Z gerd $ -*- c -*- */

Types for the RPC interfaces

For users of the Plasma_client module: The types defined here are mapped to
corresponding OCaml types, and exported via the Plasma_rpcapi_aux module.

Within the server, however, the mappings of Pfs_rpcapi_aux are used. (These
mappings differ in some minor points from the ones used for the client.)
/* Include guard. Each preprocessor directive must start its own line;
   with both on one line the #define is just trailing text of the #ifndef
   and PFS_TYPES_X never gets defined. The matching #endif is the last
   directive visible in this file. */
#ifndef PFS_TYPES_X
#define PFS_TYPES_X
longstring
/* A string up to 4G length */
typedef string longstring<>;
longstrings
/* An array of longstrings */
typedef longstring longstrings<>;
longstring_opt
/* A longstring option (XDR optional-data: the value may be absent) */
typedef longstring *longstring_opt;
mediumstring
/* A string limited to at most 4096 bytes */
typedef string mediumstring<4096>;
mediumstrings
/* An array of at most 4096 mediumstrings */
typedef mediumstring mediumstrings<4096>;
hypers
/* An array of hypers */
typedef hyper hypers<>;
trans_id
/* Transaction IDs can be used to run several transactions over the
   same TCP connection */
typedef hyper trans_id;
ug
/* Users and groups are given by name */
struct ug {
    mediumstring user;
    mediumstring group;
};
time
/* A point in time, split into whole seconds and a nanosecond part. */
struct time {
    hyper tsecs;     /* Seconds since the epoch... */
    int   tnsecs;    /* plus these nanoseconds */
};
tsecs and tnsecs must be non-negative; tnsecs < 1E9. In the filesystem
procedure update_inodeinfo, a negative tsecs is interpreted as "set the time
to the current server time".
time_opt
/* An optional time struct */
typedef time *time_opt;
ftype_enum
/* File types */
enum ftype_enum {
    FTYPE_REGULAR   = 0,
    FTYPE_DIRECTORY = 1,
    FTYPE_SYMLINK   = 2
};
ftype
/* File types as union. Every arm is void, so the union carries only the
   discriminant value. */
union ftype switch (ftype_enum d) {
case FTYPE_REGULAR:
    void;
default:
    void;
};
ticket
A ticket is handed out for blocks or block ranges, and permits read or write
access on a datanode. The ticket is valid for a single datanode only, and only
for the blocks range_start to range_start+range_length-1. The validity of the
ticket is further restricted: It is revoked when the current transaction ends.
Also, it is revoked when the time safetrans_tmo is reached.

The safetrans_vfy is a cryptographically computed signature.
/* Access ticket for a block range on a datanode, including the
   safetrans fields. */
struct ticket {
    hyper range_start;      /* First block */
    hyper range_length;     /* number of blocks */
    hyper safetrans_id;     /* safetrans ticket, first part */
    hyper safetrans_tmo;    /* safetrans ticket, timeout */
    hyper safetrans_vfy;    /* safetrans ticket, second part */
    bool  read_perm;        /* whether read permission is granted */
    bool  write_perm;       /* whether write permission is granted */
};
safetrans_id identifies the datanode transaction. safetrans_tmo is the point
in time when the access times out. After that the data nodes will not accept
access to the blocks any longer. The verifier safetrans_vfy is a hash value
built from the other information, and is used by the data node to check that
only accessible blocks are written:

safetrans_vfy=extract_64_bits(MD5(safetrans_id ^ "/" ^
safetrans_secret ^ "/" ^ range_start ^ "/" ^ range_length ^ "/" ^
read_perm ^ "/" ^ write_perm))

(Numbers are converted to string via Int64.to_string, and booleans via
string_of_bool.)

Usually, the safetrans feature is only used for securing block writes. The
protocol would also allow using it for reads, though, and compatible clients
should assume this.
blockinfo
blockinfo says where the n-th block of a file is stored on a datanode. The
number n is called the block index (starting at 0). The datanode location is
given by the identity of the datanode, and the block number of the datanode.
Block numbers count from 0 to s-1, where s is the number of blocks a datanode
stores.

In order to get some compression, adjacent blocks can share the same
blockinfo. In this case, the index and block number in blockinfo refer to the
first block of a range, and the length field denotes how long this range is.
This method of compression is only used when all the other fields of the
blocks of the range are identical.

In blockinfo there is also the information to which machine the identity of
the datanode is assigned, and whether the machine is alive. This is purely
informational, and is intended to ease the implementation of clients.

Checksums are not yet implemented.

The sequence number of the inode is increased whenever new data is written.
It is also included in blockinfo to simplify the implementation of caches.
/* Location of one block, or of a range of adjacent blocks, of a file. */
struct blockinfo {
    hyper        index;            /* block index */
    mediumstring node;             /* datanode server as "host:port"
                                      ("" if not known) */
    mediumstring identity;         /* datanode server as identity string */
    hyper        block;            /* block number on this node */
    hyper        length;           /* for how many blocks this info is valid */
    bool         node_alive;       /* informational: whether the node is alive */
    mediumstring *checksum;        /* optional checksum */
    hyper        inode_seqno;      /* current seqno of the inode */
    bool         inode_committed;  /* whether [inode_seqno] is a committed
                                      version */
    ticket       ticket;           /* the access ticket */
};
blocklist
... blockinfo structs for a block index say that the datanode is down (a
broken file).

The inode_seqno and inode_committed fields all have the same values. This is
not broken down per block (it would be possible that these values "remember"
the sequence number when the block was first committed, resulting in finer
granularity of the information).
/* A variable-length array of blockinfo structs */
typedef blockinfo blocklist<>;
inodeinfo
inodeinfo
is what is stored for an inode. Documentation is
inline below. Note that inodeinfo
structs may be passed from
the server to the client, and from the client to the server.
In the latter case, the client may not know all fields, or
may use special values in fields.
/* What is stored for an inode. The per-field notes below were bare text
   between the members in SOURCE; they are wrapped in comments here so the
   struct is valid XDR. Several notes lost their leading words in SOURCE;
   those are marked with "..." and are NOT reconstructed. */
struct inodeinfo {
    ftype filetype;

    /* usergroup: user and group can be passed in as empty strings to set
       the owner to the identity of the client. The empty strings are then
       replaced with the real user and group. When reading inodeinfo such
       strings can never be returned. */
    ug usergroup;

    int mode;

    /* eof: The eof value is seen as a convention only. The server never
       automatically changes it when blocks are allocated or freed. This
       means eof can be set to a position before the last block or after
       the last block. It is just the interpretation of the user to use
       this number as eof position.

       Conventionally, eof is only meaningful for regular files. */
    hyper eof;

    /* mtime, ctime: "... mtime of the directory." (start of this note is
       missing in SOURCE). See the documentation for time for how clients
       can request that the server fills in its own current time. */
    time mtime;
    time ctime;

    /* Replication is only meaningful for regular files. */
    int replication;

    /* blocklimit: "... blocklimit on are not allocated." (start of this
       note is missing in SOURCE; presumably the blocks from index
       blocklimit on are meant - TODO confirm). This field cannot be set by
       clients - the field value is ignored.

       Note that this is totally unrelated to eof which can be set to any
       value independent of how many blocks are allocated.

       Also, there may be holes in the file before blocklimit. */
    hyper blocklimit;

    mediumstring field1;

    /* seqno: "... seqno values are only valid within the transaction."
       (start of this note is missing in SOURCE). It is generally possible
       that seqno is set to values that were already generated for previous
       aborted transactions.

       The seqno makes it possible to easily check for any file
       modification. This field is guaranteed to change for every data or
       metadata modification. */
    hyper seqno;

    /* committed: "... inodeinfo struct is committed data." (start of this
       note is missing in SOURCE). If false, the struct has been modified
       by the transaction. This flag gives valuable information for
       deciding whether the struct can be cached or not. This field is
       automatically maintained and cannot be set directly. */
    bool committed;

    hyper create_verifier;
    bool anonymous;
};
entry
/* One directory entry: a name together with the inode it refers to. */
struct entry {
    mediumstring entry_name;    /* basename of a file in a directory */
    hyper        entry_inode;   /* inode of this file */
};
entries
/* A variable-length array of entry structs */
typedef entry entries<>;
fsstat
/* Filesystem status record: three block counters, two datanode counters
   and a list of datanode names. The exact meaning of each field is not
   documented in this file. */
struct fsstat {
    hyper         total_blocks;
    hyper         used_blocks;
    hyper         trans_blocks;
    int           enabled_datanodes;
    int           alive_datanodes;
    mediumstrings dead_datanodes;
};
errno_code
/* Error codes. OK (0) is the success value; all others denote errors. */
enum errno_code {
    OK            = 0,
    ENOTRANS      = 1,   /* no transaction */
    EFAILEDCOMMIT = 2,   /* general commit error */
    ELONGTRANS    = 3,   /* transaction too long */
    EFAILED       = 4,   /* general error */
    EPERM         = 5,   /* not owner, or op is otherwise not permitted */
    ENOENT        = 6,   /* No such file or directory */
    EACCESS       = 7,   /* Permission denied */
    EEXIST        = 8,   /* File exists */
    EFHIER        = 9,   /* File hierarchy violation (e.g. move a directory
                            into its own subdirectory) */
    EINVAL        = 10,  /* invalid argument */
    EFBIG         = 11,  /* file too big */
    ENOSPC        = 12,  /* no space left */
    EROFS         = 13,  /* read-only filesystem */
    ENAMETOOLONG  = 14,  /* filename too long */
    ECONFLICT     = 15,  /* update conflicts with another transaction */
    ECOORD        = 16,  /* this is not the coordinator */
    ENONODE       = 17,  /* unknown node */
    ETBUSY        = 18,  /* transaction is busy (last command not finished) */
    ESTALE        = 19,  /* no such inode */
    EIO           = 20,  /* datanode error, not enough datanodes */
    ELOOP         = 21,  /* looping symlinks */
    ENOTDIR       = 22,  /* operation can only be done for directory */
    EISDIR        = 23,  /* operation can only be done for non-directory */
    ENOTEMPTY     = 24,  /* directory is non-empty but need to be */
    EBADPATH      = 25   /* a path component is not a directory (POSIX sees
                            this also as ENOTDIR) */
};
Result types of the Filesystem RPCs. These results are always unions of the
possible error codes with the special value OK. For OK, a value of some type
is returned as result value, and this type is the second parameter.
/* MK_RESULT_TYPE(name,type): defines a union `name` discriminated by
   errno_code. The OK arm carries `type` (a full XDR declaration such as
   "int t", or "void"); every other code carries nothing.

   Each backslash must be the last character on its line to act as a line
   continuation; the terminating ';' is supplied at the use site. */
#define MK_RESULT_TYPE(name,type)       \
  union name switch(errno_code d) {     \
  case OK:                              \
    type;                               \
  default:                              \
    void;                               \
  }

/* Creates the types:
     rvoid
     rinodeinfo
     rblocklist
     rfsstat
     rint
     rhyper
     rhypers
     rlongstring
     rlongstrings
     rentries
 */
MK_RESULT_TYPE(rvoid,void);
MK_RESULT_TYPE(rinodeinfo,inodeinfo t);
MK_RESULT_TYPE(rblocklist,blocklist t);
MK_RESULT_TYPE(rfsstat,fsstat t);
MK_RESULT_TYPE(rint,int t);
MK_RESULT_TYPE(rhyper,hyper t);
MK_RESULT_TYPE(rhypers,hypers t);
MK_RESULT_TYPE(rlongstring,longstring t);
MK_RESULT_TYPE(rlongstrings,longstrings t);
MK_RESULT_TYPE(rentries,entries t);
ds_info
/* The ds_info struct is the wire representation of
   Nn_datastores.datastore */
struct ds_info {
    int          ds_id;
    mediumstring ds_identity;
    hyper        ds_size;
    bool         ds_enabled;
    mediumstring *ds_node;      /* optional (XDR optional-data) */
    bool         ds_alive;
};

/* A variable-length array of ds_info structs */
typedef ds_info ds_info_list<>;
params
/* A single name/value pair */
struct param {
    mediumstring name;
    mediumstring value;
};

/* A variable-length array of param structs */
typedef param params<>;

/* Revision numbers have the format:
   YYYYMMDDHHMMSSUUUUUU:<random hex digits>

   It is meaningful to sort revision numbers.
 */

/* The directive must start its own line to be seen by the preprocessor. */
#ifdef SERVER_CONTEXT
readdata
in server context
/* readdata in server context: an ordinary (not _managed) string */
typedef string readdata<>;
writedata
in server context
/* writedata in server context. (A managed string is represented
   differently in Ocamlnet's language mapping layer.) */
typedef _managed string writedata<>;
announcement
in server context
/* Announcement record (server context only). */
struct announcement {
    mediumstring  ann_clustername;  /* clustername */
    mediumstring  ann_sender;       /* sender host:port */
    mediumstrings ann_eligible;     /* list of hosts that are eligible
                                       (host:port syntax) */
    mediumstring  ann_revision;     /* the revision number of the sender */
    mediumstring  ann_rank;         /* configured rank */
    hyper         ann_random[2];    /* random numbers for
                                       self-identification */
};

enum ann_enum {
    ANN_REJECT = 0,
    ANN_ACCEPT = 1,
    ANN_SELF   = 2
};

/* Every arm is void, so the union carries only the discriminant value. */
union ann_result switch (ann_enum d) {
case ANN_REJECT:
    void;
default:
    void;
};

/* For Datanode_ctrl.safetrans: */
struct enable_ticket {
    hyper st_id;
    hyper st_tmo;
    hyper st_secret;
};

typedef enable_ticket enable_tickets<>;

/* The directive must start its own line to be seen by the preprocessor. */
#else
readdata
in client context
/* readdata in client context: a _managed string */
typedef _managed string readdata<>;
writedata
in client context
/* writedata in client context: a _managed string */
typedef _managed string writedata<>;

/* Ends the SERVER_CONTEXT conditional; the directive must start its own
   line to be seen by the preprocessor. */
#endif
dn_channel_enum
More methods may be defined in the future.
/* How block data is exchanged with a datanode. */
enum dn_channel_enum {
    DNCH_RPC = 0,   /* the data is embedded into the RPC channel */
    DNCH_SHM = 1    /* the data is exchanged via a POSIX shm object */
};
dn_channel_shm_obj
/* Describes a region inside a POSIX shm object. */
struct dn_channel_shm_obj {
    mediumstring shm_path;     /* must be a path for POSIX shm */
    hyper        shm_offset;   /* the offset to the start in the file. */
    int          shm_length;   /* the length of the object */
};
dn_channel_rd_req
/* Read request: DNCH_RPC carries nothing, DNCH_SHM carries the shm object
   description. */
union dn_channel_rd_req switch (dn_channel_enum d) {
case DNCH_RPC:
    void;
case DNCH_SHM:
    dn_channel_shm_obj ch;
};
dn_channel_rd_data
/* Read result: DNCH_RPC carries the data inline as readdata, DNCH_SHM
   carries nothing. */
union dn_channel_rd_data switch (dn_channel_enum d) {
case DNCH_RPC:
    readdata data;
case DNCH_SHM:
    void;
};
dn_channel_wr_data
/* Write payload: DNCH_RPC carries the data inline as writedata, DNCH_SHM
   carries the shm object description. */
union dn_channel_wr_data switch (dn_channel_enum d) {
case DNCH_RPC:
    writedata data;
case DNCH_SHM:
    dn_channel_shm_obj ch;
};

/* Closes the PFS_TYPES_X include guard; the directive must start its own
   line to be seen by the preprocessor. */
#endif