diff options
| author | akiyamn | 2021-05-02 17:40:33 +1000 |
|---|---|---|
| committer | akiyamn | 2021-05-02 17:40:33 +1000 |
| commit | 1b2233099db80b8c3a513d137045aae7de7a252f (patch) | |
| tree | ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2/q3/lcp.py | |
| parent | 3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff) | |
| download | fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip | |
Ass 2: Comments
Diffstat (limited to 'ass2/q3/lcp.py')
| -rw-r--r-- | ass2/q3/lcp.py | 118 |
1 files changed, 43 insertions, 75 deletions
diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py index 7e90ae0..6c72771 100644 --- a/ass2/q3/lcp.py +++ b/ass2/q3/lcp.py @@ -1,99 +1,67 @@ -from random import randrange +import sys +from ass2.ukkonen import ukkonen, Point, Node -from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count -from itertools import product +def longest_prefix(root: Node, s1: str, s2: str, offset: tuple): + """ + Finds the longest prefix of a suffix of two different strings, provided a suffix tree of "s1&s2" + The offset tuple represents the start index of each suffix to compare to. -# s1 = "abdabc" -# s2 = "daba" -# s1 = "abaabaaba" -# s2 = "baabaaabaa" - -# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple): -# print(s1[suffixes[0]:]) -# print(s2[suffixes[1]:]) -# matches = 0 -# if suffixes[0] == 0: -# point = Point(root, s1[0], 1) -# else: -# point = skip_count(suffixes[0], Point(root), 0) -# for i in range(suffixes[1], len(s2) + 1): -# print(s2[i], end=" ") -# if char_is_after(point, s2[i]): -# if point.is_explicit(): -# point = Point(point.node, s2[i], 1) -# else: -# point = skip_count(1, point, i) -# matches += 1 -# print("found") -# else: -# print("not found") -# break -# return matches - -def naive(s1, s2, offset): - count = 0 - for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))): - if t: - count += 1 - else: - return count - return count - - -def lcp(root: Node, s1: str, s2: str, offset: tuple): + Starting from the root, if the two strings have a matching character, move in that direction of the suffix tree + Increment the index to compare the next characters by, by the length of the edge just traveled + Stop when both strings 'disagree' on which path to take. + This effectively finds the deepest common ancestor of two suffixes on the same tree. + """ index = 0 head = Point(root) shortest = min(len(s1) - offset[0], len(s2) - offset[1]) - while index < shortest: + while index < shortest: # Stop searching when you hit the end of the shortest string if s1[index + offset[0]] == s2[index + offset[1]]: going_to = head.node.get_child(s1[index + offset[0]]) - head.set_node(going_to) + head.set_node(going_to) # Move to new chosen node index += going_to.edge_length else: return index return index -def test_everything_for(tree, s1, s2): - # print(s1, s2) - for i in range(len(s1)): - for j in range(len(s2)): - result = lcp(tree, s1, s2, (i, j)) - a = naive(s1, s2, (i, j)) - assert result == a +def read_in_string(filename): + """ + Reads in a string from a given file name + """ + with open(filename, "r") as file: + return file.read() -def new_string(max_letters, max_len): - return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)]) +def create_suffix_list(filename): + """ + From the suffix index file, generate a list of tuples representing each suffix pair to search + """ + tuples = [] + for line in read_in_string(filename).split("\n"): + parts = line.split(" ") + tuples.append((int(parts[0]), int(parts[1]))) + return tuples -def test(max_letters, max_len): - alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters))) - counter = 0 - while True: - s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len) - tree = ukkonen(s1 + "z" + s2) - test_everything_for(tree, s1, s2) - counter += 1 - if counter % 1000 == 0: - print(counter) +def write_longest_prefixes(root: Node, s1: str, s2: str, suffixes): + """ + Write the longest common suffixes of two strings provided a list of suffix index pairs to file + """ + results = map(lambda t: f"{t[0]} {t[1]} {longest_prefix(root, s1, s2, t)}", suffixes) + with open("output_lcp.txt", "w") as file: + file.write("\n".join(results)) def main(): - s1 = "abcdacbdab" - s2 = "dacbdabc" - print(s1 + "&" + s2) - tree = ukkonen(s1 + "&" + s2) - tree.print_tree() - - # tup = (4, 5) - # result = longest_prefix(tree, s1, s2, (0, 5)) - # result = lcp(tree, s1, s2, tup) - # print(naive(s1, s2, tup)) - # print(result) + assert len(sys.argv) == 4 # Need correct number of arguments + s1 = read_in_string(sys.argv[1]) + s2 = read_in_string(sys.argv[2]) + suffixes = create_suffix_list(sys.argv[3]) + combined = s1 + "&" + s2 + tree = ukkonen(combined) # Generate the suffix tree for both strings once + write_longest_prefixes(tree, s1, s2, suffixes) # Write answers to file if __name__ == "__main__": - test(4, 50) - # main() + main() |
