diff options
| author | akiyamn | 2021-05-02 17:40:33 +1000 | 
|---|---|---|
| committer | akiyamn | 2021-05-02 17:40:33 +1000 | 
| commit | 1b2233099db80b8c3a513d137045aae7de7a252f (patch) | |
| tree | ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2/q3/lcp.py | |
| parent | 3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff) | |
| download | fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip | |
Ass 2: Comments
Diffstat (limited to 'ass2/q3/lcp.py')
| -rw-r--r-- | ass2/q3/lcp.py | 118 | 
1 files changed, 43 insertions, 75 deletions
| diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py index 7e90ae0..6c72771 100644 --- a/ass2/q3/lcp.py +++ b/ass2/q3/lcp.py @@ -1,99 +1,67 @@ -from random import randrange +import sys +from ass2.ukkonen import ukkonen, Point, Node -from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count -from itertools import product +def longest_prefix(root: Node, s1: str, s2: str, offset: tuple): +    """ +    Finds the longest prefix of a suffix of two different strings, provided a suffix tree of "s1&s2" +    The offset tuple represents the start index of each suffix to compare to. -# s1 = "abdabc" -# s2 = "daba" -# s1 = "abaabaaba" -# s2 = "baabaaabaa" - -# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple): -#     print(s1[suffixes[0]:]) -#     print(s2[suffixes[1]:]) -#     matches = 0 -#     if suffixes[0] == 0: -#         point = Point(root, s1[0], 1) -#     else: -#         point = skip_count(suffixes[0], Point(root), 0) -#     for i in range(suffixes[1], len(s2) + 1): -#         print(s2[i], end=" ") -#         if char_is_after(point, s2[i]): -#             if point.is_explicit(): -#                 point = Point(point.node, s2[i], 1) -#             else: -#                 point = skip_count(1, point, i) -#             matches += 1 -#             print("found") -#         else: -#             print("not found") -#             break -#     return matches - -def naive(s1, s2, offset): -    count = 0 -    for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))): -        if t: -            count += 1 -        else: -            return count -    return count - - -def lcp(root: Node, s1: str, s2: str, offset: tuple): +    Starting from the root, if the two strings have a matching character, move in that direction of the suffix tree +    Increment the index to compare the next characters by, by the length of the edge just traveled +    Stop when both strings 'disagree' on which path to take. +    This effectively finds the deepest common ancestor of two suffixes on the same tree. +    """      index = 0      head = Point(root)      shortest = min(len(s1) - offset[0], len(s2) - offset[1]) -    while index < shortest: +    while index < shortest:  # Stop searching when you hit the end of the shortest string          if s1[index + offset[0]] == s2[index + offset[1]]:              going_to = head.node.get_child(s1[index + offset[0]]) -            head.set_node(going_to) +            head.set_node(going_to)  # Move to new chosen node              index += going_to.edge_length          else:              return index      return index -def test_everything_for(tree, s1, s2): -    # print(s1, s2) -    for i in range(len(s1)): -        for j in range(len(s2)): -            result = lcp(tree, s1, s2, (i, j)) -            a = naive(s1, s2, (i, j)) -            assert result == a +def read_in_string(filename): +    """ +    Reads in a string from a given file name +    """ +    with open(filename, "r") as file: +        return file.read() -def new_string(max_letters, max_len): -    return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)]) +def create_suffix_list(filename): +    """ +    From the suffix index file, generate a list of tuples representing each suffix pair to search +    """ +    tuples = [] +    for line in read_in_string(filename).split("\n"): +        parts = line.split(" ") +        tuples.append((int(parts[0]), int(parts[1]))) +    return tuples -def test(max_letters, max_len): -    alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters))) -    counter = 0 -    while True: -        s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len) -        tree = ukkonen(s1 + "z" + s2) -        test_everything_for(tree, s1, s2) -        counter += 1 -        if counter % 1000 == 0: -            print(counter) +def write_longest_prefixes(root: Node, s1: str, s2: str, suffixes): +    """ +    Write the longest common suffixes of two strings provided a list of suffix index pairs to file +    """ +    results = map(lambda t: f"{t[0]} {t[1]} {longest_prefix(root, s1, s2, t)}", suffixes) +    with open("output_lcp.txt", "w") as file: +        file.write("\n".join(results))  def main(): -    s1 = "abcdacbdab" -    s2 = "dacbdabc" -    print(s1 + "&" + s2) -    tree = ukkonen(s1 + "&" + s2) -    tree.print_tree() - -    # tup = (4, 5) -    # result = longest_prefix(tree, s1, s2, (0, 5)) -    # result = lcp(tree, s1, s2, tup) -    # print(naive(s1, s2, tup)) -    # print(result) +    assert len(sys.argv) == 4  # Need correct number of arguments +    s1 = read_in_string(sys.argv[1]) +    s2 = read_in_string(sys.argv[2]) +    suffixes = create_suffix_list(sys.argv[3]) +    combined = s1 + "&" + s2 +    tree = ukkonen(combined)  # Generate the suffix tree for both strings once +    write_longest_prefixes(tree, s1, s2, suffixes) # Write answers to file  if __name__ == "__main__": -    test(4, 50) -    # main() +    main() | 
