refs #5139 add documentation for templated omt

git-svn-id: file:///svn/toku/tokudb@45981 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
Yoni Fogel 2013-04-17 00:01:00 -04:00
parent 3dbff30332
commit 58dbd4a071

View file

@ -23,11 +23,59 @@
namespace toku { namespace toku {
/**
* Order Maintenance Tree (OMT)
*
* Maintains a collection of totally ordered values, where each value has an integer weight.
* The OMT is a mutable datatype.
*
* The Abstraction:
*
* An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
* The vector is numbered from $0$ to $|V|-1$.
* Each value has a weight. The weight of the $i$th element is denoted $w(V_i)$.
*
* We can create a new OMT, which is the empty vector.
*
* We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
* $|V'|=1+|V|$ and
*
* V'_j = V_j if $j<i$
* x if $j=i$
* V_{j-1} if $j>i$.
*
* We can specify $i$ using a kind of function instead of as an integer.
* Let $b$ be a function mapping from values to nonzero integers, such that
* the signum of $b$ is monotonically increasing.
* We can specify $i$ as the minimum integer such that $b(V_i)>0$.
*
* We look up a value using its index, or using a Heaviside function.
* For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
* When looking up values, we can look up
* $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a special return code if no such value exists.)
* (Rationale: Ordinarily we want $i$ to be unique. But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
* $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an indication that no such value exists.)
* $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an indication that no such value exists.)
*
* When looking up a value using a Heaviside function, we get the value and its index.
*
* We can also split an OMT into two OMTs, splitting the weight of the values evenly.
* Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
* The resulting two OMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
* All of the values from the original OMT go into one of the new OMTs.
* If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
* the new left OMT or the new right OMT is larger.
*
* Performance:
* Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
* The memory required is O(|V|).
*/
template<typename omtdata_t, template<typename omtdata_t,
typename omtdataout_t=omtdata_t> typename omtdataout_t=omtdata_t>
struct omt { struct omt {
/** /**
* * Effect: Create an empty OMT.
* Performance: constant time.
*/ */
void create(void) void create(void)
{ {
@ -43,8 +91,17 @@ struct omt {
} }
/** /**
* * Effect: Create an OMT containing values. The number of values is in numvalues.
* Creates this as the new OMT.
* Requires: this has not been created yet
* Requires: values != NULL
* Requires: values is sorted
* Performance: time=O(numvalues)
* Rationale: Normally to insert N values takes O(N lg N) amortized time.
* If the N values are known in advance, are sorted, and
* the structure is empty, we can batch insert them much faster.
*/ */
__attribute__((nonnull))
void create_from_sorted_array(const omtdata_t *const values, const uint32_t numvalues) void create_from_sorted_array(const omtdata_t *const values, const uint32_t numvalues)
{ {
this->create_internal(numvalues); this->create_internal(numvalues);
@ -53,7 +110,18 @@ struct omt {
} }
/** /**
* * Effect: Create an OMT containing values. The number of values is in numvalues.
* On success the OMT takes ownership of the *values array, and sets *values=NULL.
* Requires: this has not been created yet
* Requires: values != NULL
* Requires: *values is sorted
* Requires: *values was allocated with toku_malloc
* Requires: Capacity of the *values array is <= new_capacity
* Requires: On success, *values may not be accessed again by the caller.
* Performance: time=O(1)
* Rationale: create_from_sorted_array takes O(numvalues) time.
* By taking ownership of the array, we save a malloc and memcpy,
* and possibly a free (if the caller is done with the array).
*/ */
__attribute__((nonnull)) __attribute__((nonnull))
void create_steal_sorted_array(omtdata_t **const values, const uint32_t numvalues, const uint32_t new_capacity) void create_steal_sorted_array(omtdata_t **const values, const uint32_t numvalues, const uint32_t new_capacity)
@ -66,7 +134,16 @@ struct omt {
} }
/** /**
* * Effect: Create a new OMT, storing it in *newomt.
* The values to the right of index (starting at index) are moved to *newomt.
* Requires: newomt != NULL
* Returns
* 0 success,
* EINVAL if idx > this->size()
* On nonzero return, omt and *newomt are unmodified.
* Performance: time=O(n)
* Rationale: We don't need a split-evenly operation. We need to split items so that their total sizes
* are even, and other similar splitting criteria. It's easy to split evenly by calling size(), and dividing by two.
*/ */
__attribute__((nonnull)) __attribute__((nonnull))
int split_at(omt *const newomt, const uint32_t idx) { int split_at(omt *const newomt, const uint32_t idx) {
@ -81,7 +158,10 @@ struct omt {
} }
/** /**
* * Effect: Appends leftomt and rightomt to produce a new omt.
* Creates this as the new omt.
* leftomt and rightomt are destroyed.
* Performance: time=O(n) is acceptable, but one can imagine implementations that are O(\log n) worst-case.
*/ */
__attribute__((nonnull)) __attribute__((nonnull))
void merge(omt *const leftomt, omt *const rightomt) { void merge(omt *const leftomt, omt *const rightomt) {
@ -122,7 +202,10 @@ struct omt {
} }
/** /**
* * Effect: Creates a copy of an omt.
* Creates this as the clone.
* Each element is copied directly. If they are pointers, the underlying data is not duplicated.
* Performance: O(n) or the running time of fill_array_with_subtree_values()
*/ */
void clone(const omt &src) void clone(const omt &src)
{ {
@ -136,7 +219,10 @@ struct omt {
} }
/** /**
* * Effect: Creates a copy of an omt.
* Creates this as the clone.
* Each element is assumed to be a pointer, and the underlying data is duplicated for the clone using toku_malloc.
* Performance: the running time of iterate()
*/ */
void deep_clone(const omt &src) void deep_clone(const omt &src)
{ {
@ -147,7 +233,9 @@ struct omt {
} }
/** /**
* * Effect: Set the tree to be empty.
* Note: Will not reallocate or resize any memory.
* Performance: time=O(1)
*/ */
void clear(void) void clear(void)
{ {
@ -161,6 +249,12 @@ struct omt {
} }
/** /**
* Effect: Destroy an OMT, freeing all its memory.
* If the values being stored are pointers, their underlying data is not freed. See free_items()
* Those values may be freed before or after calling destroy().
* Rationale: Returns no values since free() cannot fail.
* Rationale: Does not free the underlying pointers to reduce complexity.
* Performance: time=O(1)
* *
*/ */
void destroy(void) void destroy(void)
@ -181,7 +275,8 @@ struct omt {
} }
/** /**
* * Effect: return |this|.
* Performance: time=O(1)
*/ */
inline uint32_t size(void) const inline uint32_t size(void) const
{ {
@ -193,7 +288,20 @@ struct omt {
} }
/** /**
* * Effect: Insert value into the OMT.
* If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
* Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
* If no such i exists, then let i be |V|
* Then this has the same effect as
* insert_at(value, i);
* If idx!=NULL then i is stored in *idx
* Requires: The signum of h must be monotonically increasing.
* Returns:
* 0 success
* DB_KEYEXIST the key is present (h was equal to zero for some value)
* On nonzero return, omt is unchanged.
* Performance: time=O(\log N) amortized.
* Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
*/ */
template<typename omtcmp_t, template<typename omtcmp_t,
int (*h)(const omtdata_t &, const omtcmp_t &)> int (*h)(const omtdata_t &, const omtcmp_t &)>
@ -216,7 +324,14 @@ struct omt {
} }
/** /**
* * Effect: Increases indexes of all items at slot >= idx by 1.
* Insert value into the position at idx.
* Returns:
* 0 success
* EINVAL if idx > this->size()
* On error, omt is unchanged.
* Performance: time=O(\log N) amortized time.
* Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
*/ */
int insert_at(const omtdata_t &value, const uint32_t idx) int insert_at(const omtdata_t &value, const uint32_t idx)
{ {
@ -246,6 +361,13 @@ struct omt {
} }
/** /**
* Effect: Replaces the item at idx with value.
* Returns:
* 0 success
* EINVAL if idx>=this->size()
* On error, omt is unchanged.
* Performance: time=O(\log N)
* Rationale: The FT needs to be able to replace a value with another copy of the same value (allocated in a different location)
* *
*/ */
int set_at(const omtdata_t &value, const uint32_t idx) int set_at(const omtdata_t &value, const uint32_t idx)
@ -260,7 +382,14 @@ struct omt {
} }
/** /**
* * Effect: Delete the item in slot idx.
* Decreases indexes of all items at slot > idx by 1.
* Returns
* 0 success
* EINVAL if idx>=this->size()
* On error, omt is unchanged.
* Rationale: To delete an item, first find its index using find or find_zero, then delete it.
* Performance: time=O(\log N) amortized.
*/ */
int delete_at(const uint32_t idx) int delete_at(const uint32_t idx)
{ {
@ -287,7 +416,19 @@ struct omt {
} }
/** /**
* * Effect: Iterate over the values of the omt, from left to right, calling f on each value.
* The first argument passed to f is a ref-to-const of the value stored in the omt.
* The second argument passed to f is the index of the value.
* The third argument passed to f is iterate_extra.
* The indices run from 0 (inclusive) to this->size() (exclusive).
* Requires: f != NULL
* Returns:
* If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate.
* If f always returns zero, then iterate returns 0.
* Requires: Don't modify the omt while running. (E.g., f may not insert or delete values from the omt.)
* Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the omt.
* Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
* Rationale: We may at some point use functors, but for now this is a smaller change from the old OMT.
*/ */
template<typename iterate_extra_t, template<typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
@ -296,7 +437,22 @@ struct omt {
} }
/** /**
* * Effect: Iterate over the values of the omt, from left to right, calling f on each value.
* The first argument passed to f is a ref-to-const of the value stored in the omt.
* The second argument passed to f is the index of the value.
* The third argument passed to f is iterate_extra.
* The indices run from 0 (inclusive) to this->size() (exclusive).
* We will iterate only over [left,right)
*
* Requires: left <= right
* Requires: f != NULL
* Returns:
* EINVAL if right > this->size()
* If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate_on_range.
* If f always returns zero, then iterate_on_range returns 0.
* Requires: Don't modify the omt while running. (E.g., f may not insert or delete values from the omt.)
* Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the omt.
* Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
*/ */
template<typename iterate_extra_t, template<typename iterate_extra_t,
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
@ -309,7 +465,16 @@ struct omt {
} }
/** /**
* * Effect: Iterate over the values of the omt, from left to right, calling f on each value.
* The first argument passed to f is a pointer to the value stored in the omt.
* The second argument passed to f is the index of the value.
* The third argument passed to f is iterate_extra.
* The indices run from 0 (inclusive) to this->size() (exclusive).
* Requires: same as for iterate()
* Returns: same as for iterate()
* Performance: same as for iterate()
* Rationale: In general, most iterators should use iterate() since they should not modify the data stored in the omt. This function is for iterators which need to modify values (for example, free_items).
* Rationale: We assume if you are transforming the data in place, you want to do it to everything at once, so there is not yet an iterate_on_range_ptr (but there could be).
*/ */
template<typename iterate_extra_t, template<typename iterate_extra_t,
int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
@ -321,12 +486,24 @@ struct omt {
} }
} }
/**
* Effect: Iterate over the values of the omt, from left to right, freeing each value with toku_free
* Requires: all items in OMT to have been malloced with toku_malloc
* Rationale: This function was added due to a problem encountered in ft-ops.c. We needed to free the elements and then
* destroy the OMT. However, destroying the OMT requires invalidating cursors. This cannot be done if the values of the OMT
* have been already freed. So, this function is written to invalidate cursors and free items.
*/
void free_items(void) { void free_items(void) {
this->iterate_ptr<void, free_items_iter>(nullptr); this->iterate_ptr<void, free_items_iter>(nullptr);
} }
/** /**
* * Effect: Set *value=V_idx
* Returns
* 0 success
* EINVAL if idx>=this->size()
* On nonzero return, *value is unchanged
* Performance: time=O(\log N)
*/ */
int fetch(const uint32_t idx, omtdataout_t *const value) const int fetch(const uint32_t idx, omtdataout_t *const value) const
{ {
@ -340,7 +517,22 @@ struct omt {
} }
/** /**
* * Effect: Find the smallest i such that h(V_i, extra)>=0
* If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = V_i, and return 0.
* If there is such an i and h(V_i,extra)>0 then set *idxp=i and return DB_NOTFOUND.
* If there is no such i then set *idxp=this->size() and return DB_NOTFOUND.
* Note: value is of type omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is fixed by the instantiation.
* If it is the value type, then the value is copied out (even if the value type is a pointer to something else)
* If it is the pointer type, then *value is set to a pointer to the data within the omt.
* This is determined by the type of the omt as initially declared.
* If the omt is declared as omt<foo_t>, then foo_t's will be stored and foo_t's will be returned by find and related functions.
* If the omt is declared as omt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the stored items will be returned by find and related functions.
* Rationale:
* Structs too small for malloc should be stored directly in the omt.
* These structs may need to be edited as they exist inside the omt, so we need a way to get a pointer within the omt.
* Using separate functions for returning pointers and values increases code duplication and reduces type-checking.
* That also reduces the ability of the creator of a data structure to give advice to its future users.
* Slight overloading in this case seemed to provide a better API and better type checking.
*/ */
template<typename omtcmp_t, template<typename omtcmp_t,
int (*h)(const omtdata_t &, const omtcmp_t &)> int (*h)(const omtdata_t &, const omtcmp_t &)>
@ -358,18 +550,74 @@ struct omt {
return r; return r;
} }
/**
*
*/
template<typename omtcmp_t, template<typename omtcmp_t,
int (*h)(const omtdata_t &, const omtcmp_t &)> int (*h)(const omtdata_t &, const omtcmp_t &)>
int find(const omtcmp_t &extra, int direction, omtdataout_t *const value, uint32_t *const idxp) const int find(const omtcmp_t &extra, int direction, omtdataout_t *const value, uint32_t *const idxp) const
/**
* Effect:
* If direction >0 then find the smallest i such that h(V_i,extra)>0.
* If direction <0 then find the largest i such that h(V_i,extra)<0.
* (Direction may not be equal to zero.)
* If value!=NULL then store V_i in *value
* If idxp!=NULL then store i in *idxp.
* Requires: The signum of h is monotonically increasing.
* Returns
* 0 success
* DB_NOTFOUND no such value is found.
* On nonzero return, *value and *idxp are unchanged
* Performance: time=O(\log N)
* Rationale:
* Here's how to use the find function to find various things
* Cases for find:
* find first value: ( h(v)=+1, direction=+1 )
* find last value ( h(v)=-1, direction=-1 )
* find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 )
* find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 )
* find X or successor to X ( same as find first X. )
*
* Rationale: To help understand heaviside functions and behavior of find:
* There are 7 kinds of heaviside functions.
* The signum of the h must be monotonically increasing.
* Given a function of the following form, A is the element
* returned for direction>0, B is the element returned
* for direction<0, C is the element returned for
* direction==0 (see find_zero) (with a return of 0), and D is the element
* returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
* If any of A, B, or C are not found, then asking for the
* associated direction will return DB_NOTFOUND.
* See find_zero for more information.
*
* Let the following represent the signum of the heaviside function.
*
* -...-
* A
* D
*
* +...+
* B
* D
*
* 0...0
* C
*
* -...-0...0
* AC
*
* 0...0+...+
* C B
*
* -...-+...+
* AB
* D
*
* -...-0...0+...+
* AC B
*/
{ {
uint32_t tmp_index; uint32_t tmp_index;
uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index; uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
if (direction == 0) { invariant(direction != 0);
abort(); if (direction < 0) {
} else if (direction < 0) {
if (this->is_array) { if (this->is_array) {
return this->find_internal_minus_array<omtcmp_t, h>(extra, value, child_idxp); return this->find_internal_minus_array<omtcmp_t, h>(extra, value, child_idxp);
} else { } else {
@ -385,7 +633,7 @@ struct omt {
} }
/** /**
* * Effect: Return the size (in bytes) of the omt, as it resides in main memory. If the data stored are pointers, don't include the size of what they all point to.
*/ */
size_t memory_size(void) { size_t memory_size(void) {
if (this->is_array) { if (this->is_array) {