Plasma GitLab Archive
Projects Blog Knowledge

/* $Id: pfs_types.x 536 2011-12-11 22:59:10Z gerd $ -*- c -*- */

/** Types for the RPC interfaces

 */

/** {b For users of the {!Plasma_client} module:} The types defined
    here are mapped to corresponding Ocaml types, and exported via
    the {!Plasma_rpcapi_aux} module.
 */

/** {b Within the server,} however, the mappings of {!Pfs_rpcapi_aux}
    are used. (These mappings differ in some minor points from the ones
    used for the client.)
*/

#ifndef PFS_TYPES_X
#define PFS_TYPES_X

/** {2 [longstring]} */

typedef string longstring<>;
/** A string without explicit length bound, i.e. up to 4G length (the
    maximum XDR permits)
 */

/** {2 [longstrings]} */

typedef longstring longstrings<>;
/** An array of longstrings. The number of elements is not explicitly
    bounded.
 */

/** {2 [longstring_opt]} */

typedef longstring *longstring_opt;
/** An optional longstring: either absent, or exactly one [longstring] */

/** {2 [mediumstring]} */

typedef string mediumstring<4096>;
/** A string of at most 4096 bytes */

/** {2 [mediumstrings]} */

typedef mediumstring mediumstrings<4096>;
/** An array of at most 4096 mediumstrings */

/** {2 [hypers]} */

typedef hyper hypers<>;
/** An array of hypers (64 bit integers). The number of elements is not
    explicitly bounded.
 */

/** {2 [trans_id]} */

typedef hyper trans_id;
/** A transaction ID. Transaction IDs make it possible to run several
    transactions over the same TCP connection.
*/

/** {2 [ug]} */

struct ug {
    mediumstring user;    /* name of the user */
    mediumstring group;   /* name of the group */
};
/** A pair of user and group. Users and groups are given by name (not by
    numeric ID).
 */

/** {2 [time] } */

struct time {
    hyper tsecs;  /* Seconds since the epoch... */
    int   tnsecs; /* plus these nanoseconds */
};
/** A point in time. Normally, [tsecs] and [tnsecs] must both be
    non-negative, and [tnsecs < 1E9]. The only exception is the
    filesystem procedure [update_inodeinfo]: there, a negative [tsecs]
    is interpreted as "set the time to the current server time".
*/

/** {2 [time_opt]} */

typedef time *time_opt;
/** An optional [time] struct: either absent, or exactly one [time] */

/** {2 [ftype_enum]} */

enum ftype_enum {
    FTYPE_REGULAR = 0,     /* regular file */
    FTYPE_DIRECTORY = 1,   /* directory */
    FTYPE_SYMLINK = 2      /* symbolic link */
};
/** File types. This enumeration is the discriminant of the [ftype]
    union.
 */

/** {2 [ftype]} */

union ftype switch(ftype_enum d) {
  case FTYPE_REGULAR: 
    void;
  default:              /* covers FTYPE_DIRECTORY and FTYPE_SYMLINK */
    void;
};
/** File types as union. All arms are [void], i.e. the union carries
    only the discriminant [d] and no further data.
 */

/** {2:ticket [ticket]} */

/** The [ticket] is handed out for blocks or block ranges, and permits
    read or write access on a datanode. The ticket is valid for a
    single datanode only, and only for the blocks [range_start] to
    [range_start+range_length-1]. The validity of the ticket is further
    restricted: It is revoked when the current transaction ends. Also,
    it is revoked when the time [safetrans_tmo] is reached.

    The [safetrans_vfy] is a cryptographically computed signature.
*/

struct ticket {
    hyper      range_start;   /* first block of the range the ticket is valid for */
    hyper      range_length;  /* number of blocks in the range */
    hyper      safetrans_id;  /* safetrans ticket, first part: identifies the datanode transaction */
    hyper      safetrans_tmo; /* safetrans ticket, timeout: point in time when the ticket is revoked */
    hyper      safetrans_vfy; /* safetrans ticket, second part: the verifier (signature) */
    bool       read_perm;     /* whether read permission is granted */
    bool       write_perm;    /* whether write permission is granted */
};

/** {3 Background information} */

/** For securing the communication with the datanode.
    The blocks are accessible for a limited period of time only.
    The [safetrans_id] identifies the datanode transaction. [safetrans_tmo]
    is the point in time when the access times out. After that
    the data nodes will not accept access to the blocks any longer.
    The verifier [safetrans_vfy] is a hash value built from the
    other information, and is used by the data
    node to check that only accessible blocks are written:

    {[safetrans_vfy=extract_64_bits(MD5(safetrans_id ^ "/" ^ 
        safetrans_secret ^ "/" ^ range_start ^ "/" ^ range_length ^ "/" ^
	read_perm ^ "/" ^ write_perm))
     ]}

     (Numbers converted to string via [Int64.to_string], and booleans via
     [string_of_bool].)

     Usually, the safetrans feature is only used for securing 
     block writes. The protocol also allows using it for
     reads, though, and compatible clients should assume this.
 */

/** {2:blockinfo [blockinfo]} */

/** [blockinfo] says where the n-th block of a file is stored on a datanode.
    The number [n] is called the {i block index} (starting at 0). 
    The datanode location is given by the {i identity} of the datanode,
    and the {i block number} of the datanode. Block numbers count from 0
    to [s-1] when [s] is the number of blocks a datanode stores.

    In order to get some compression, adjacent blocks can share the
    same [blockinfo]. In this case, the index and block number in
    [blockinfo] refer to the first block of a range, and the length
    field denotes how long this range is. This method of compression is
    only used when all the other fields of the blocks of the range are
    identical.

    In [blockinfo] there is also the information to which machine the
    identity of the datanode is assigned, and whether the machine is
    alive. This is purely informational, and is intended to ease the
    implementation of clients.

    Checksums are not yet implemented.

    The sequence number of the inode is increased whenever new data
    is written. It is also included in [blockinfo] to simplify the
    implementation of caches.
*/

struct blockinfo {
    hyper      index;         /* block index (of the first block if this info covers a range) */
    mediumstring node;          /* datanode server as "host:port" ("" if not known) */
    mediumstring identity;      /* datanode server as identity string */
    hyper      block;         /* block number on this node (of the first block of a range) */
    hyper      length;        /* for how many adjacent blocks this info is valid */
    bool       node_alive;    /* informational: whether the node is alive */
    mediumstring *checksum;     /* optional checksum (checksums are not yet implemented) */
    hyper      inode_seqno;   /* current seqno of the inode */
    bool       inode_committed; /* whether [inode_seqno] is a committed version */
    ticket     ticket;        /* the access ticket for the blocks described here */
};

/** {2:blocklist [blocklist]} */

/** Block lists describe where the blocks of a file are stored.
    Note that these phenomena can occur:
     - the same block index can occur several times (replicas)
     - a certain block index does not occur at all (a file hole)
     - all [blockinfo] structs for a block index say that the
       datanode is down (a broken file)

    After allocating blocks or retrieving a blocklist from the server,
    the [inode_seqno] and [inode_committed] fields all have the same
    values. This is not broken down per block (although it would be
    possible that these values "remember" the sequence number when the
    block was first committed, resulting in finer granularity of the
    information).
*/

typedef blockinfo blocklist<>;
/** An array of [blockinfo] structs. The number of elements is not
    explicitly bounded.
 */

/** {2:inodeinfo [inodeinfo]} */

/** [inodeinfo] is what is stored for an inode. Documentation is
    inline below. Note that [inodeinfo] structs may be passed from
    the server to the client, and from the client to the server.
    In the latter case, the client may not know all fields, or
    may use special values in fields.
*/

struct inodeinfo {
    ftype filetype;
    /** {ul {- The file type. Some fields are only meaningful for certain
          types.}}
    */

    ug    usergroup;
    /** {ul {- The owner. Both [user] and [group] can be passed in as empty
        strings to set the owner to the identity of the client. The empty
        strings are then replaced with the real user and group. When reading
        [inodeinfo], such empty strings are never returned.}}
     */

    int   mode;
    /** {ul {- File permission bits}}
     */

    hyper eof;
    /** {ul {- The [eof] value is seen as a convention only. The server never
       automatically changes it when blocks are allocated or freed.
       This means [eof] can be set to a position before the last
       block or after the last block. It is up to the user to interpret
       this number as the [eof] position.

       Conventionally, [eof] is only meaningful for regular files.}}
    */

    time  mtime;
    time  ctime;
    /** {ul {- Time fields are not automatically maintained, except that a
        link or unlink operation implicitly updates the [mtime] of
        the directory. See the documentation of [time] for how clients
        can request that the server fills in its own current time.}}
     */

    int   replication;   
    /** {ul {- The replication factor the file ought to have. Clients can set it to
        0 to request the default replication factor. Values returned from
        the server are always >= 1.

        Replication is only meaningful for regular files.}}
     */

    hyper blocklimit;
    /** {ul {- The blocks from the index [blocklimit] on are not allocated. This
        field cannot be set by clients - the field value is ignored.

        Note that this is totally unrelated to [eof], which can be set to
        any value independent of how many blocks are allocated.

        Also, there may be holes in the file before [blocklimit].}}
     */
    
    mediumstring field1;
    /** {ul {- For symlinks this is the target. For other types this field has no
        defined meaning.}}
    */

    hyper seqno;
    /** {ul {- This number is increased by the server when blocks are
        allocated or freed, i.e. for every content modification of the file,
        or when a new version of the inodeinfo is written. It is not
        possible to change this field directly. The number is increased for each
        metadata operation individually (and not only once
        for the transaction doing so). Until committed, the new [seqno]
        values are only valid within the transaction.
        It is generally possible that [seqno] is set to values that were
        already generated for previous aborted transactions.

        The [seqno] makes it possible to
        easily check for any file modification. This field is guaranteed
        to change for every data or metadata modification.}}
     */

    bool committed;
    /** {ul {- This flag is true if the data in the [inodeinfo] struct
        is committed data. If false, the struct has been modified by the
        transaction. This flag gives valuable information for deciding
        whether the struct can be cached or not. This field is automatically
        maintained and cannot be set directly.}}
    */

    hyper create_verifier;
    /** {ul {- Intended for use as NFSv3 create verifier. Set to 0 outside NFS scope.}}
     */

    bool anonymous;
    /** {ul {- Whether this inode does not have a name (read-only)}} */
};


/** {2 [entry]} */

/** Entries of directories */

struct entry {
    mediumstring entry_name;    /* basename of a file in a directory */
    hyper        entry_inode;   /* inode of this file */
};
/** One directory entry: it pairs the basename with the inode */

/** {2 [entries]} */

typedef entry entries<>;
/** An array of [entry] structs. The number of elements is not explicitly
    bounded.
 */

/** {2 [fsstat]} */

/* NOTE(review): except for [trans_blocks], the fields are not documented
   in this file. The remarks marked "presumably" are inferred from the
   field names only - confirm against the server implementation.
*/
struct fsstat {
    hyper    total_blocks;   /* presumably: number of blocks in the filesystem */
    hyper    used_blocks;    /* presumably: number of blocks in use */
    hyper    trans_blocks;
    /** {ul {- Blocks in a transitional phase: these are allocated by a transaction,
       but the transaction is not yet committed}}
    */

    int      enabled_datanodes;  /* presumably: number of enabled datanodes */
    int      alive_datanodes;    /* presumably: number of datanodes that are alive */
    mediumstrings dead_datanodes; /* presumably: the datanodes that are not alive; at most 4096 strings */
};

/** {2:errno_code [errno_code]} */

/* The error codes. This enumeration is the discriminant of the result
   unions created with MK_RESULT_TYPE: [OK] is the only code for which a
   result value is returned; all other codes denote errors.
*/
enum errno_code {
    OK = 0,               /* no error; the result value follows */
    ENOTRANS = 1,         /* no transaction */
    EFAILEDCOMMIT = 2,    /* general commit error */
    ELONGTRANS = 3,       /* transaction too long */
    EFAILED = 4,          /* general error */
    EPERM = 5,            /* not owner, or op is otherwise not permitted */
    ENOENT = 6,           /* No such file or directory */
    EACCESS = 7,          /* Permission denied */
    EEXIST = 8,           /* File exists */
    EFHIER = 9,           /* File hierarchy violation (e.g. move a directory into its own subdirectory) */
    EINVAL = 10,          /* invalid argument */
    EFBIG = 11,           /* file too big */
    ENOSPC = 12,          /* no space left */
    EROFS = 13,           /* read-only filesystem */
    ENAMETOOLONG = 14,    /* filename too long */
    ECONFLICT = 15,       /* update conflicts with another transaction */
    ECOORD = 16,          /* this is not the coordinator */
    ENONODE = 17,         /* unknown node */
    ETBUSY = 18,          /* transaction is busy (last command not finished) */
    ESTALE = 19,          /* no such inode */
    EIO = 20,             /* datanode error, not enough datanodes */
    ELOOP = 21,           /* looping symlinks */
    ENOTDIR = 22,         /* operation can only be done for directory */
    EISDIR = 23,          /* operation can only be done for non-directory */
    ENOTEMPTY = 24,       /* directory is non-empty but needs to be empty */
    EBADPATH = 25         /* a path component is not a directory (POSIX sees this also as ENOTDIR) */
};

/** {2 Result types of RPC's} */

/** This macro is used for creating the result types of [Filesystem]
    RPC's. These results are always unions of the possible error codes
    with the special value [OK]. For [OK], a value of some type is
    returned as result value, and this type is the second parameter.
    For all other codes nothing is returned besides the code itself.

    Parameters:
    - [name]: the name of the union type to define
    - [type]: the declaration of the [OK] arm, either a type followed
      by a field name, or just [void]
*/

#define MK_RESULT_TYPE(name,type)        \
  union name switch(errno_code d) {      \
    case OK:                             \
      type;                              \
    default:                             \
      void;                              \
  }

/** Creates the types:
    - [rvoid]
    - [rinodeinfo]
    - [rblocklist]
    - [rfsstat]
    - [rint]
    - [rhyper]
    - [rhypers]
    - [rlongstring]
    - [rlongstrings]
    - [rentries]

    Except for [rvoid], where the [OK] arm is [void], the [OK] arm of
    each union has a single field [t]. The type of [t] is the type
    whose name follows the [r] prefix.
*/

MK_RESULT_TYPE(rvoid,void);
MK_RESULT_TYPE(rinodeinfo,inodeinfo t);
MK_RESULT_TYPE(rblocklist,blocklist t);
MK_RESULT_TYPE(rfsstat,fsstat t);
MK_RESULT_TYPE(rint,int t);
MK_RESULT_TYPE(rhyper,hyper t);
MK_RESULT_TYPE(rhypers,hypers t);
MK_RESULT_TYPE(rlongstring,longstring t);
MK_RESULT_TYPE(rlongstrings,longstrings t);
MK_RESULT_TYPE(rentries,entries t);


/** {2 [ds_info]} */

/* NOTE(review): the meaning of the individual fields is not documented
   in this file; see Nn_datastores.datastore.
*/
struct ds_info {
    int        ds_id;
    mediumstring ds_identity;
    hyper      ds_size;
    bool       ds_enabled;
    mediumstring *ds_node;    /* optional: may be absent */
    bool       ds_alive;
};
/** The [ds_info] struct is the wire representation of 
    {!Nn_datastores.datastore}

    This is only used for internal purposes!
*/

typedef ds_info ds_info_list<>;
/** An array of [ds_info] structs. The number of elements is not
    explicitly bounded.
 */


/** {2 [dn_info]} */

struct dn_info {
    mediumstring dn_identity;  /* presumably the identity string of the datanode (cf. blockinfo) - confirm */
    hyper        dn_size;      /* NOTE(review): unit not stated here, presumably blocks - confirm */
    mediumstring dn_node;      /* presumably "host:port" as in blockinfo - confirm */
};
/** The externally visible information about datanodes. Such structs only
    describe live datanodes.
*/

typedef dn_info dn_info_list<>;
/** An array of [dn_info] structs. The number of elements is not
    explicitly bounded.
 */


/** {2 [params]} */

struct param {
    mediumstring name;    /* the name of the parameter */
    mediumstring value;   /* the value, as string */
};
/** A single parameter as name/value pair of strings */

typedef param params<>;
/** An array of [param] structs. The number of elements is not explicitly
    bounded.
 */


/* Revision numbers have the format:

   YYYYMMDDHHMMSSUUUUUU:<random hex digits>

   It is meaningful to sort revision numbers.
*/

#ifdef SERVER_CONTEXT

/** {2 [readdata] in server context} */

typedef string readdata<>;
/** In the server context this is a normal string without explicit
    length bound (whereas the client context defines [readdata] as
    managed string).
 */

/** {2 [writedata] in server context} */

typedef _managed string writedata<>;

/** A managed string without explicit length bound.
    (A managed string is represented differently in Ocamlnet's language
    mapping layer.)
*/

/** {2 [announcement] in server context} */

struct announcement {
    mediumstring     ann_clustername;
    /* the name of the cluster */

    mediumstring     ann_sender;
    /* the sender of the announcement, as host:port */

    mediumstrings    ann_eligible;
    /* list of hosts that are eligible (host:port syntax) */

    mediumstring     ann_revision;
    /* the revision number of the sender (see the format description of
       revision numbers in this file)
    */

    mediumstring     ann_rank;
    /* configured rank */

    hyper            ann_random[2];
    /* two random numbers for self-identification */
};


/* The possible responses to an [announcement]; discriminant of
   [ann_result].

   NOTE(review): the meanings are not documented in this file. Judging
   from the names, the announcement is either rejected or accepted;
   ANN_SELF presumably says that the receiver recognized the announcement
   as its own (cf. [ann_random], "for self-identification") - confirm.
*/
enum ann_enum {
    ANN_REJECT = 0,
    ANN_ACCEPT = 1,
    ANN_SELF = 2
};


/* [ann_enum] as union. All arms are void, i.e. the union carries only
   the discriminant [d] and no further data.
*/
union ann_result switch(ann_enum d) {
case ANN_REJECT:
    void;
default:            /* covers ANN_ACCEPT and ANN_SELF */
    void;
};


/** For [Datanode_ctrl.safetrans]: */

/* NOTE(review): the fields are not documented in this file. Presumably
   [st_id] and [st_tmo] correspond to [safetrans_id] and [safetrans_tmo]
   of [ticket], and [st_secret] is the [safetrans_secret] that enters the
   computation of [safetrans_vfy] - confirm.
*/
struct enable_ticket {
    hyper st_id;
    hyper st_tmo;
    hyper st_secret;
};

typedef enable_ticket enable_tickets<>;
/** An array of [enable_ticket] structs. The number of elements is not
    explicitly bounded.
 */


#else

/** {2 [readdata] in client context} */

typedef _managed string readdata<>;
/** In the client context this is a managed string without explicit
    length bound (whereas the server context defines [readdata] as
    normal string).
 */

/** {2 [writedata] in client context} */

typedef _managed string writedata<>;
/** A managed string without explicit length bound (the same definition
    as in the server context)
 */

#endif

/** {2 [dn_channel_enum] } */

/** Block data can be exchanged with the datanode servers in two ways:
    + The block data is included in the normal RPC call as string
    + The block data is put into a shared memory object

  Of course, the second method works only if the client and the server
  are on the same node. Also, the client needs to invoke the server RPC
  via a Unix Domain socket, and not via TCP.

  More methods may be defined in the future.
*/

enum dn_channel_enum {
    DNCH_RPC = 0,   /* the data is embedded into the RPC message as string */
    DNCH_SHM = 1    /* the data is exchanged via a POSIX shm object */
};

/** {2:dn_channel_shm_obj [dn_channel_shm_obj] } */

/** This struct identifies a shared memory object */

struct dn_channel_shm_obj {
    mediumstring shm_path;     /* must be a path for POSIX shm */
    hyper        shm_offset;   /* the offset to the start in the file. */
    int          shm_length;   /* the length of the object (NOTE(review): presumably in bytes - confirm) */
};


/** {2 [dn_channel_rd_req] } */

/** This is the argument for data reads. The client can request the 
    data exchange method. For shared memory, the client also has to
    say which shared memory object will receive the data.
*/

union dn_channel_rd_req switch (dn_channel_enum d) {
case DNCH_RPC:
    void;                    /* nothing more to say: the data comes back in the RPC message */
case DNCH_SHM:
    dn_channel_shm_obj ch;   /* the shared memory object that will receive the data */
};

/** {2 [dn_channel_rd_data] } */

/** This is the return value of the data server for a read request.
    If the data is included in the RPC message, it follows now.
    If the data is put into shared memory, the client can now expect it
    to be there.
*/

union dn_channel_rd_data switch (dn_channel_enum d) {
case DNCH_RPC:
    readdata data;   /* the data, included in the RPC message */
case DNCH_SHM:
    void;            /* the data has been put into shared memory */
};

/** {2 [dn_channel_wr_data] } */

/** For write requests, the client either includes the data directly
    in the message, or it has already put it into a shared memory
    object, and only includes the information where it can be found.
*/

union dn_channel_wr_data switch (dn_channel_enum d) {
case DNCH_RPC:
    writedata data;          /* the data to write, included in the RPC message */
case DNCH_SHM:
    dn_channel_shm_obj ch;   /* the shared memory object that already contains the data */
};
#endif

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml