aboutsummaryrefslogtreecommitdiff
path: root/ass2/q3/lcp.py
diff options
context:
space:
mode:
authorakiyamn2021-05-02 17:40:33 +1000
committerakiyamn2021-05-02 17:40:33 +1000
commit1b2233099db80b8c3a513d137045aae7de7a252f (patch)
treeef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2/q3/lcp.py
parent3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff)
downloadfit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz
fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip
Ass 2: Comments
Diffstat (limited to 'ass2/q3/lcp.py')
-rw-r--r--ass2/q3/lcp.py118
1 files changed, 43 insertions, 75 deletions
diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py
index 7e90ae0..6c72771 100644
--- a/ass2/q3/lcp.py
+++ b/ass2/q3/lcp.py
@@ -1,99 +1,67 @@
-from random import randrange
+import sys
+from ass2.ukkonen import ukkonen, Point, Node
-from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count
-from itertools import product
+def longest_prefix(root: Node, s1: str, s2: str, offset: tuple):
+ """
+ Finds the longest prefix of a suffix of two different strings, provided a suffix tree of "s1&s2"
+ The offset tuple represents the start index of each suffix to compare to.
-# s1 = "abdabc"
-# s2 = "daba"
-# s1 = "abaabaaba"
-# s2 = "baabaaabaa"
-
-# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple):
-# print(s1[suffixes[0]:])
-# print(s2[suffixes[1]:])
-# matches = 0
-# if suffixes[0] == 0:
-# point = Point(root, s1[0], 1)
-# else:
-# point = skip_count(suffixes[0], Point(root), 0)
-# for i in range(suffixes[1], len(s2) + 1):
-# print(s2[i], end=" ")
-# if char_is_after(point, s2[i]):
-# if point.is_explicit():
-# point = Point(point.node, s2[i], 1)
-# else:
-# point = skip_count(1, point, i)
-# matches += 1
-# print("found")
-# else:
-# print("not found")
-# break
-# return matches
-
-def naive(s1, s2, offset):
- count = 0
- for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))):
- if t:
- count += 1
- else:
- return count
- return count
-
-
-def lcp(root: Node, s1: str, s2: str, offset: tuple):
+ Starting from the root, if the two strings have a matching character, move in that direction of the suffix tree
+ Increment the index to compare the next characters by, by the length of the edge just traveled
+ Stop when both strings 'disagree' on which path to take.
+ This effectively finds the deepest common ancestor of two suffixes on the same tree.
+ """
index = 0
head = Point(root)
shortest = min(len(s1) - offset[0], len(s2) - offset[1])
- while index < shortest:
+ while index < shortest: # Stop searching when you hit the end of the shortest string
if s1[index + offset[0]] == s2[index + offset[1]]:
going_to = head.node.get_child(s1[index + offset[0]])
- head.set_node(going_to)
+ head.set_node(going_to) # Move to new chosen node
index += going_to.edge_length
else:
return index
return index
-def test_everything_for(tree, s1, s2):
- # print(s1, s2)
- for i in range(len(s1)):
- for j in range(len(s2)):
- result = lcp(tree, s1, s2, (i, j))
- a = naive(s1, s2, (i, j))
- assert result == a
+def read_in_string(filename):
+ """
+ Reads in a string from a given file name
+ """
+ with open(filename, "r") as file:
+ return file.read()
-def new_string(max_letters, max_len):
- return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)])
+def create_suffix_list(filename):
+ """
+ From the suffix index file, generate a list of tuples representing each suffix pair to search
+ """
+ tuples = []
+ for line in read_in_string(filename).split("\n"):
+ parts = line.split(" ")
+ tuples.append((int(parts[0]), int(parts[1])))
+ return tuples
-def test(max_letters, max_len):
- alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters)))
- counter = 0
- while True:
- s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len)
- tree = ukkonen(s1 + "z" + s2)
- test_everything_for(tree, s1, s2)
- counter += 1
- if counter % 1000 == 0:
- print(counter)
+def write_longest_prefixes(root: Node, s1: str, s2: str, suffixes):
+ """
+ Write the longest common suffixes of two strings provided a list of suffix index pairs to file
+ """
+ results = map(lambda t: f"{t[0]} {t[1]} {longest_prefix(root, s1, s2, t)}", suffixes)
+ with open("output_lcp.txt", "w") as file:
+ file.write("\n".join(results))
def main():
- s1 = "abcdacbdab"
- s2 = "dacbdabc"
- print(s1 + "&" + s2)
- tree = ukkonen(s1 + "&" + s2)
- tree.print_tree()
-
- # tup = (4, 5)
- # result = longest_prefix(tree, s1, s2, (0, 5))
- # result = lcp(tree, s1, s2, tup)
- # print(naive(s1, s2, tup))
- # print(result)
+ assert len(sys.argv) == 4 # Need correct number of arguments
+ s1 = read_in_string(sys.argv[1])
+ s2 = read_in_string(sys.argv[2])
+ suffixes = create_suffix_list(sys.argv[3])
+ combined = s1 + "&" + s2
+ tree = ukkonen(combined) # Generate the suffix tree for both strings once
+ write_longest_prefixes(tree, s1, s2, suffixes) # Write answers to file
if __name__ == "__main__":
- test(4, 50)
- # main()
+ main()