// // STREE.CPP - Suffix tree creation // // Mark Nelson, April, 1996 // // This code has been tested with Borland C++ and // Microsoft Visual C++. // // This program asks you for a line of input, then // creates the suffix tree corresponding to the given // text. Additional code is provided to validate the // resulting tree after creation. // #include #include #include #include #include #include // // When a new tree is added to the table, we step // through all the currently defined suffixes from // the active point to the end point. This structure // defines a Suffix by its final character. // In the canonical representation, we define that last // character by starting at a node in the tree, and // following a string of characters, represented by // first_char_index and last_char_index. The two indices // point into the input string. Note that if a suffix // ends at a node, there are no additional characters // needed to characterize its last character position. // When this is the case, we say the node is Explicit, // and set first_char_index > last_char_index to flag // that. // class Suffix { public : int origin_node; int first_char_index; int last_char_index; Suffix( int node, int start, int stop ) : origin_node( node ), first_char_index( start ), last_char_index( stop ){}; int Explicit(){ return first_char_index > last_char_index; } int Implicit(){ return last_char_index >= first_char_index; } void Canonize(); }; // // The suffix tree is made up of edges connecting nodes. // Each edge represents a string of characters starting // at first_char_index and ending at last_char_index. // Edges can be inserted and removed from a hash table, // based on the Hash() function defined here. The hash // table indicates an unused slot by setting the // start_node value to -1. // class Edge { public : int first_char_index; int last_char_index; int end_node; int start_node; void Insert(); void Remove(); Edge(); Edge( int init_first_char_index, int init_last_char_index, int parent_node ); int SplitEdge( Suffix &s ); static Edge Find( int node, int c ); static int Hash( int node, int c ); }; // // The only information contained in a node is the // suffix link. Each suffix in the tree that ends // at a particular node can find the next smaller suffix // by following the suffix_node link to a new node. Nodes // are stored in a simple array. // class Node { public : int suffix_node; Node() { suffix_node = -1; } static int Count; }; // // The maximum input string length this program // will handle is defined here. A suffix tree // can have as many as 2N edges/nodes. The edges // are stored in a hash table, whose size is also // defined here. // const int MAX_LENGTH = 1000; const int HASH_TABLE_SIZE = 2179; //A prime roughly 10% larger // // This is the hash table where all the currently // defined edges are stored. You can dump out // all the currently defined edges by iterating // through the table and finding edges whose start_node // is not -1. // Edge Edges[ HASH_TABLE_SIZE ]; // // The array of defined nodes. The count is 1 at the // start because the initial tree has the root node // defined, with no children. // int Node::Count = 1; Node Nodes[ MAX_LENGTH * 2 ]; // // The input buffer and character count. Please note that N // is the length of the input string -1, which means it // denotes the maximum index in the input buffer. // char T[ MAX_LENGTH ]; int N; // // Necessary forward references // void validate(); int walk_tree( int start_node, int last_char_so_far ); // // The default ctor for Edge just sets start_node // to the invalid value. This