Greedy Motif Search in Python - python-3.x

I am studying the Bioinformatics course at Coursera, and have been stuck on the following problem for 5 days:
Implement GreedyMotifSearch.
Input: Integers k and t, followed by a collection of strings Dna.
Output: A collection of strings BestMotifs resulting from applying GreedyMotifSearch(Dna, k, t).
If at any step you find more than one Profile-most probable k-mer in a given string, use the
one occurring first.
Here's my attempt to solve this (I just copied it from my IDE, so pardon any print statements):
def GreedyMotifSearch(DNA, k, t):
    """
    Asker's attempt at greedy motif search, kept as posted.

    DNA: collection of DNA strings (list)
    k: length of the motifs (int)
    t: number of strings of DNA that are scanned (int)
    Returns: the list of k-mers with the lowest Score that was seen (list)
    """
    # NOTE(review): the nesting below is reconstructed; the post lost its indentation.
    import math
    bestMotifs = []
    bestScore = math.inf
    # Initial answer: the first k-mer of every string.
    for string in DNA:
        bestMotifs.append(string[:k])
    base = DNA[0]
    for i in window(base, k):
        newMotifs = []
        for j in range(t):
            # The profile is rebuilt from the single k-mer i on every pass, so
            # k-mers already picked for earlier strings never influence it.
            profile = ProfileMatrix([i])
            probable = ProfileMostProbable(DNA[j], k, profile)
            newMotifs.append(probable)
        # "<=" lets a later candidate with an equal score replace the current best.
        if Score(newMotifs) <= bestScore:
            bestScore = Score(newMotifs)
            bestMotifs = newMotifs
    return bestMotifs
The helper functions are these:
def SymbolToNumber(Symbol):
    """
    Converts base to number (in lexicographical order)
    Symbol: the letter to be converted (str)
    Returns: the number corresponding to that base (int), or None when
    Symbol is not one of "A", "C", "G", "T"
    """
    return {"A": 0, "C": 1, "G": 2, "T": 3}.get(Symbol)
def NumberToSymbol(index):
    """
    Finds base from number (in lexicographical order)
    index: the number to be converted (int)
    Returns: the base corresponding to index (str), or None when index
    is not 0, 1, 2 or 3
    """
    # A dict (rather than "ACGT"[index]) keeps negative and out-of-range
    # indices mapping to None instead of wrapping around or raising.
    return {0: "A", 1: "C", 2: "G", 3: "T"}.get(index)
def HammingDistance(p, q):
    """
    Finds the number of mismatches between 2 DNA segments of equal lengths
    p: first DNA segment (str)
    q: second DNA segment (str)
    Returns: number of mismatches (int)

    If the lengths differ, only the overlapping prefix is compared
    (zip stops at the shorter segment); the extra symbols are not counted.
    """
    return sum(1 for a, b in zip(p, q) if a != b)
def window(s, k):
    """
    Yields every length-k slice of s, from left to right
    s: the sequence to slide over (str)
    k: the window length (int)
    Yields: s[i:i+k] for i = 0 .. len(s) - k; nothing when k > len(s)
    """
    for start in range(len(s) - k + 1):
        yield s[start:start + k]
def ProfileMostProbable(Text, k, Profile):
    """
    Finds a k-mer that was most likely to be generated by profile among
    all k-mers in Text
    Text: given DNA segment (str)
    k: length of pattern (int)
    Profile: maps each of "A", "C", "G", "T" to a list of at least k
    per-position weights (dict)
    Returns: profile-most probable k-mer (str); when several k-mers share
    the highest probability, the one occurring first in Text
    Raises: ValueError when Text holds fewer than k symbols
    """
    best_kmer = None
    best_probability = None
    for start in range(len(Text) - k + 1):
        kmer = Text[start:start + k]
        probability = 1.0
        for position, symbol in enumerate(kmer):
            probability *= float(Profile[symbol][position])
        # Strict ">" keeps the earliest k-mer when probabilities tie.
        if best_kmer is None or probability > best_probability:
            best_kmer = kmer
            best_probability = probability
    if best_kmer is None:
        raise ValueError("Text must contain at least k symbols")
    return best_kmer
def Count(Motifs):
    """
    Counts how often each base occurs in each column of the motifs
    Motifs: motif sequences; the first one fixes the number of columns (list)
    Returns: maps "A", "C", "G", "T" to a list of per-column counts (dict)
    Raises: KeyError when a motif contains a symbol other than A, C, G, T
    """
    k = len(Motifs[0])
    count = {symbol: [0] * k for symbol in "ACGT"}
    for motif in Motifs:
        for j in range(k):
            count[motif[j]][j] += 1
    return count
def FindConsensus(motifs):
    """
    Finds a consensus sequence for given list of motifs
    motifs: a list of motif sequences (list)
    Returns: consensus sequence of motifs (str); in each column the most
    frequent base wins, ties go to the earlier base in "ACGT" order
    """
    consensus = []
    for i in range(len(motifs[0])):
        column = [motif[i] for motif in motifs]
        # max() returns the first maximal item, which gives the A < C < G < T
        # tie-break; symbols outside "ACGT" are simply never candidates.
        consensus.append(max("ACGT", key=column.count))
    return "".join(consensus)
def ProfileMatrix(motifs):
    """
    Finds the profile (count) matrix for given list of motifs
    motifs: list of motif sequences (list)
    Returns: maps "A", "C", "G", "T" to a list of per-column counts (dict)

    The values are raw counts, not probabilities. Symbols other than
    A, C, G, T are ignored.
    """
    width = len(motifs[0])
    Profile = {symbol: [0] * width for symbol in "ACGT"}
    for motif in motifs:
        for j in range(width):
            row = Profile.get(motif[j])
            if row is not None:
                row[j] += 1
    return Profile
def Score(motifs):
    """
    Finds score of motifs relative to the consensus sequence
    motifs: a list of given motifs (list)
    Returns: total number of mismatches between the motifs and their
    consensus sequence (float with an integral value)
    """
    consensus = FindConsensus(motifs)
    # The sum of Hamming distances is an exact integer, so no rounding is
    # needed; float() keeps the return type callers already receive.
    return float(sum(HammingDistance(consensus, motif) for motif in motifs))
It seems fine to me. However, when I run this code for quiz problems, it gives an incorrect answer. Their code grading system shows this error:
Failed test #3. Your indexing may be off by one at the beginning of each string in Dna.
I have tried everything I can think of and run this code on all their sample data and debug data, but I simply can't figure out how to make this code work. Please help me with any possible solutions to this.

You have a few problems. I think this should address them all. I've included comments explaining each change along with your original code and a reference to the relevant Pseudocode in the debug data page you linked to.
def GreedyMotifSearch(DNA, k, t):
    """
    Greedy motif search: for every k-mer of the first string, grow a motif
    list one string at a time and keep the best-scoring list.

    DNA: collection of DNA strings (list)
    k: length of the motifs (int)
    t: number of strings in DNA (int); kept for the required signature,
    the loop below uses len(DNA)
    Returns: the motif list with the lowest Score, earliest one on ties (list)
    """
    import math
    # Default answer: the first k-mer of every string.
    bestMotifs = [string[:k] for string in DNA]
    bestScore = math.inf
    base = DNA[0]
    for i in window(base, k):
        # Change here. Should start with one element in motifs and build up.
        # As in the line "motifs ← list with only Dna[0](i,k)"
        # (was: newMotifs = [])
        newMotifs = [i]
        # Change here to iterate over len(DNA).
        # Should go through "for j from 1 to |Dna| - 1"
        # (was: for j in range(t):)
        for j in range(1, len(DNA)):
            # Change here. Should build up motifs and build profile using them.
            # (was: profile = ProfileMatrix([i]))
            profile = ProfileMatrix(newMotifs)
            probable = ProfileMostProbable(DNA[j], k, profile)
            newMotifs.append(probable)
        # Change to < rather than <= so that, among equally scored motif
        # lists, the earliest one found is kept. As referenced in the instructions:
        # If at any step you find more than one Profile-most probable k-mer in
        # a given string, use the one occurring **first**.
        # (was: if Score(newMotifs) <= bestScore:)
        score = Score(newMotifs)  # computed once instead of twice
        if score < bestScore:
            bestScore = score
            bestMotifs = newMotifs
    return bestMotifs

Related

Is there a way to speed up these functions?

I have these functions. They work correctly, but is there a way to speed them up? I tried splitting the dataset, but that takes the same time or longer than the original functions. I'm working with big arrays (1 million+ × 2504 × 2). create_needed_pos takes around 350 s for a 1.2 million × 2054 × 2 array, but my biggest is around 10 billion × 2054 × 2.
# NOTE(review): presumably "@nb.njit" in the original post (decorator marker lost) — confirm.
#nb.njit
def create_needed_pos(chr_pos, pos):
    # Collects every pos[k] that equals some chr_pos[i], scanning all (i, k)
    # pairs with i as the outer loop; a value is appended once per matching pair.
    # NOTE(review): nesting reconstructed; the post lost its indentation.
    needed_pos = nb.typed.List.empty_list(nb.int32)
    for i in range(len(chr_pos)):
        for k in range(len(pos)):
            if chr_pos[i] == pos[k]:
                # A match at exactly i == k == 1 replaces the whole list, which
                # discards anything collected before it.
                if i == k == 1:
                    needed_pos = nb.typed.List([pos[k]])
                else:
                    needed_pos.append(pos[k])
    return needed_pos
# Vectorized with NumPy masks; no JIT decorator is needed for this version.
def create_mat(geno):
    """
    Encodes each (first, second) pair of geno as a single uint8 code.

    geno: array indexed as geno[i, k, 0] and geno[i, k, 1] (array-like)
    Returns: np.uint8 array of shape geno.shape[:2] holding
        2 for the pair (0, 0)
        1 for the pairs (0, 1) and (1, 0)
        0 for the pair (1, 1)
        9 for every other pair
    """
    geno = np.asarray(geno)
    first, second = geno[:, :, 0], geno[:, :, 1]
    # Start from the "anything else" code and overwrite the recognised pairs.
    geno_mat = np.full(geno.shape[:2], 9, dtype=np.uint8)
    geno_mat[(first == 0) & (second == 0)] = 2
    geno_mat[((first == 0) & (second == 1)) | ((first == 1) & (second == 0))] = 1
    geno_mat[(first == 1) & (second == 1)] = 0
    return geno_mat

Appending results from a list to a string

Heavy python beginner here. I want to create a simple function for a PIN guessing game that receives two 4-digit lists ( [guess], [answer] ) and returns a string with 4 letters stating how close I am to guessing the correct [answer] sequence (eg. Higher, True, Lower, Higher)
However, I get a new list for each string:
def checkNumbers(guess,right):
    # Asker's version, kept as posted (nesting reconstructed to match the
    # output shown in the question).
    for n in range(4):
        # result is re-created on every pass, so it never holds more than one letter.
        result = []
        if guess[n] == right[n]:
            result.append("T") #true
        elif guess[n] < right[n]:
            result.append("H") #higher
        elif guess[n] > right[n]:
            result.append("L") #lower
        else:
            result.append("F") #false
        # Printed once per digit, which yields one single-item list per line.
        print (result)
    return
checkNumbers([1,2,3,5],[2,2,1,6])
The result should look like this:
checkNumbers([1,2,3,4], [2, 2, 1 , 6]) #call function with ([guess], [answer])
'HTLH' #returns a string stating how accurate [guess] is to [answer] list
Result looks like this however:
checkNumbers([1,2,3,5],[2,2,1,6])
['H']
['T']
['L']
['H']
Thanks very much in advance for any help I could get.
you can use string instead of list or "".join()
def checkNumbers(guess, right):
    """
    Compares the first four digits of guess with right, position by position.

    guess: the guessed digits (list)
    right: the correct digits (list)
    Returns: a 4-letter string, one letter per position: "T" (equal),
    "H" (answer is higher), "L" (answer is lower), "F" (not comparable).
    The string is also printed.
    """
    result = ""
    for n in range(4):
        if guess[n] == right[n]:
            result += "T"  # true
        elif guess[n] < right[n]:
            result += "H"  # higher
        elif guess[n] > right[n]:
            result += "L"  # lower
        else:
            result += "F"  # false
    print(result)
    return result
but... maybe you want to use zip function
def checkNumbers(guess, right):
    """
    Compares guess with right digit by digit (stops at the shorter list).

    guess: the guessed digits (list)
    right: the correct digits (list)
    Returns: one letter per compared position: "T" (equal), "H" (answer is
    higher), "L" (answer is lower), "F" (not comparable). The string is
    also printed.
    """
    letters = []
    for g, r in zip(guess, right):
        if g == r:
            letters.append("T")  # true
        elif g < r:
            letters.append("H")  # higher
        elif g > r:
            letters.append("L")  # lower
        else:
            letters.append("F")  # false
    result = "".join(letters)
    print(result)
    return result
Funny bonus here:
def checkNumbers(guess, right):
    """
    Compact variant: prints and returns one letter per compared position.

    guess: the guessed digits (list)
    right: the correct digits (list)
    Returns: string of "T" (equal), "H" (answer is higher), "L" (answer is lower)
    """
    # (g > r) + (g != r) is 0 when equal, 1 when g < r, 2 when g > r.
    result = "".join("THL"[(g > r) + (g != r)] for g, r in zip(guess, right))
    print(result)
    return result
I don't get why you need else part...
Initiate the list and print the result outside of the loop:
def checkNumbers(guess, right):
    # Sketch only: the loop body is left as a placeholder comment.
    result = []
    for n in range(4):
        # do loopy stuff
    # Printing after the loop shows the fully built list once.
    print (result)
    return # not strictly necessary
If you do it inside, you are creating a new list on every iteration.

How to write main function in Python 3?

I am new to python. I got below code, but my main function doesn't work. Can anyone help? Thank you!
The question is:
Given a string s, return the last substring of s in lexicographical order.
Example 1:
Input: "abab"
Output: "bab"
Explanation: The substrings are ["a", "ab", "aba", "abab", "b", "ba", "bab"]. The lexicographically maximum substring is "bab".
The IDE said;
Traceback (most recent call last):
File "F:/!!!PDF/!!!PyCharmWorkspace/LeetCode/AA1163LastSubstringInLexicographicalOrder3.py", line
1, in <module>
class Solution:
File "F:/!!!PDF/!!!PyCharmWorkspace/LeetCode/AA1163LastSubstringInLexicographicalOrder3.py", line
20, in Solution
print(lastSubstring(s))
TypeError: lastSubstring() missing 1 required positional argument: 's'
Below is my code.
class Solution:
    def lastSubstring(self, s: str) -> str:
        # indexes holds the candidate start positions; i is how many characters
        # of each candidate have been compared so far.
        i, indexes = 0, list(range(len(s)))
        while len(indexes) > 1:
            new = []
            # Largest character found at offset i among the remaining candidates.
            mx = max([s[i + j] for j in indexes if i + j < len(s)])
            for k, j in enumerate(indexes):
                # Skip a candidate that starts exactly where the previous
                # candidate's compared span currently ends.
                if k - 1 >= 0 and indexes[k - 1] + i == j:
                    continue
                if i + j >= len(s):
                    break
                if s[i + j] == mx:
                    new.append(j)
            i += 1
            indexes = new
        return s[indexes[0]:]
    # NOTE(review): the traceback in the post reports this call as "in Solution",
    # which suggests the guard sat inside the class body, where lastSubstring is
    # a plain function that still expects self — nesting reconstructed accordingly.
    if __name__ == '__main__':
        s = "leetcode"
        print(lastSubstring(s))
You are trying to use class function while you didn't create an instance of this class.
There are two ways to solve this issue.
First - Declare the lastSubstring as classmethod:
classmethod() methods are bound to the class rather than to an object. Class
methods can be called on both the class and an instance of it.
The example below illustrates this
clearly.
class Solution:
    @classmethod
    def lastSubstring(cls, s: str) -> str:
        """
        Returns the last substring of s in lexicographical order.

        s: the input string (str)
        Returns: the lexicographically largest substring, which is always a
        suffix of s (str); "" for an empty s
        """
        if not s:
            return ""
        # indexes holds the candidate start positions; i is how many characters
        # of each candidate have been compared so far.
        i, indexes = 0, list(range(len(s)))
        while len(indexes) > 1:
            new = []
            # Largest character found at offset i among the remaining candidates.
            mx = max([s[i + j] for j in indexes if i + j < len(s)])
            for k, j in enumerate(indexes):
                # Skip a candidate that starts exactly where the previous
                # candidate's compared span currently ends.
                if k - 1 >= 0 and indexes[k - 1] + i == j:
                    continue
                if i + j >= len(s):
                    break
                if s[i + j] == mx:
                    new.append(j)
            i += 1
            indexes = new
        return s[indexes[0]:]


if __name__ == '__main__':
    s = "leetcode"
    print(Solution.lastSubstring(s))
Second - Create an instance from the Solution class first:
if __name__ == '__main__':
    s = "leetcode"
    # Calling through an instance supplies self automatically.
    sol= Solution()
    print(sol.lastSubstring(s))

always got the same output with random.choice in a function

I tried using for-loop to exec my function but its always the same result.
import random
def main(arr):
    result = random.choice(arr)
    # The next line is a placeholder from the post, not runnable code.
    ...some code...
    return len(result)
# The return value of main is discarded on every pass.
for i in range(100):
    main(arr)
I could only get diff result from stop/run the terminal. Anyone know why?
my question is the same as this one. random.choice always same
import random
# NOTE(review): np is used below but no numpy import is shown in this snippet.
results = []
# Each line of the file becomes one list of tab-separated fields.
with open('kargerMinCut.txt') as inputfile:
    for line in inputfile:
        results.append(line.strip().split('\t'))
def contract(arr):
    # Repeatedly merges two rows of arr until only two rows remain.
    # arr: list of lists; element 0 of a row is compared against the values
    # stored in the other rows.
    # NOTE(review): nesting reconstructed; the post lost its indentation.
    # NOTE(review): arr is changed in place (pop, item assignment, del), so a
    # second call on the same list starts from the already-contracted two rows,
    # skips the loop and returns the same value again.
    while len(arr) > 2:
        # Generate random number in list of lists
        # ranList = random.choice(arr)
        ranList = arr[np.random.choice(len(arr))]
        ranNum = random.choice(ranList[1:])
        # retrieve the index of the random number
        listIndex = arr.index(ranList)
        # Find the row whose first element is the chosen value.
        for i in range(0, len(arr)):
            if arr[i][0] == ranNum:
                targetList = i
                break
        target = ranList[0]
        for i in range(0, len(arr)):
            if i == listIndex:
                # Drop the row's own label and every reference to ranNum.
                arr[i].pop(0)
                arr[i] = [x for x in arr[i] if x != ranNum]
            elif i == targetList:
                # Drop references to the row that is being merged away.
                arr[i] = [x for x in arr[i] if x != target]
            else:
                # Everywhere else, rename target to ranNum.
                for index, item in enumerate(arr[i]):
                    if item == target:
                        arr[i][index] = ranNum
        arr[targetList] += arr[listIndex]
        del arr[listIndex]
    return len(arr[0])-1
the arr would be like this
array = [[1,2,3,4],[2,1,3,4],[3,1,2,4],[4,1,2,3]]
I don't know what you do inside your function, but I get normal, varying results. In the question you linked to, the person had simply set a seed. A seeded pseudorandom generator gives you the same "random" output every time. Here is a link to deepen your knowledge of pseudorandom numbers
import random

arr = [1, 2, 3, 4, 5]


def main(arr):
    """Print one randomly chosen element of arr."""
    print(random.choice(arr))


# 100 independent draws; the printed values vary from line to line.
for _ in range(100):
    main(arr)
The result is as it has to be:
1
3
5
3
4
3
1
4
4
3
2

Quick sort counting

Python questions again.
I want to count the number of comparison operations performed by quick sort. Because I use a recursive function, I think that assigning count = 0 at the beginning of the function body is inappropriate, so I wrote it as follows.
def QuickSort(lst, count = 0):
    # Asker's version, kept as posted (nesting reconstructed).
    # Sorts lst in place and returns the pair (lst, count).
    if len(lst) > 1:
        pivot_idx = len(lst) // 2
        smaller_nums, larger_nums = [], []
        for idx, num in enumerate(lst):
            if idx != pivot_idx:
                if num < lst[pivot_idx]:
                    smaller_nums.append(num)
                else:
                    larger_nums.append(num)
        # count grows by 1 per recursive call here, not by 1 per
        # element-vs-pivot comparison made in the loop above.
        count = QuickSort(smaller_nums, count + 1)[1]
        count = QuickSort(larger_nums, count + 1)[1]
        lst[:] = smaller_nums + [lst[pivot_idx]] + larger_nums
    return lst, count
However, after counting, I confirmed the count which is much lower than my expectation. According to big o, the quick sort would have to show the calculation of n * log (n), but it showed a much lower count. For example, when sorting a list with 1000 random elements, we expected to see a count of 1000 * log (1000) = 6907, but actually only 1164 counts. I am wondering if I am misusing the count in the function or misunderstanding it.
Thank you.
Your post is mistaken on several points:
Big-O allows arbitrary constant factors and also ignores the behavior for "small" values of n, where "small" can be arbitrarily large for any given analysis. So your computations are meaningless.
Your counts are wrong. There's one comparison per loop iteration. You're counting something else.
This is a strange way to code the count. Just use a global variable.
Try this. Note really you're using twice as many comparisons as this reports. The check that the loop index isn't the pivot could be eliminated with a smarter implementation.
import random

c = 0  # running total of element-vs-pivot comparisons


def QuickSort(lst):
    """
    Returns a sorted copy of lst and adds the number of element-vs-pivot
    comparisons to the module-level counter c.

    lst: the values to sort (list)
    Returns: a new sorted list (lst itself when it has at most one element)
    """
    global c
    if len(lst) <= 1:
        return lst
    pivot_idx = len(lst) // 2
    pivot = lst[pivot_idx]
    smaller, larger = [], []
    for idx, num in enumerate(lst):
        if idx != pivot_idx:
            c += 1
            (smaller if num < pivot else larger).append(num)
    return QuickSort(smaller) + [pivot] + QuickSort(larger)


def Run(n):
    """Sorts n random integers from 0..1000 and prints the comparison total."""
    lst = [random.randint(0, 1000) for _ in range(n)]
    QuickSort(lst)
    print(c)


Run(1000)
If you're aghast at the prospect of using a global variable, then you can just wrap the sort in a class:
import random


class QuickSort:
    """Quicksort that records how many element-vs-pivot comparisons it makes."""

    def __init__(self):
        self.comparisons = 0

    def sort(self, lst):
        """
        Returns a sorted copy of lst, adding one to self.comparisons for
        every element compared against a pivot.

        lst: the values to sort (list)
        Returns: a new sorted list (lst itself when it has at most one element)
        """
        if len(lst) <= 1:
            return lst
        pivot_idx = len(lst) // 2
        pivot = lst[pivot_idx]
        smaller, larger = [], []
        for idx, num in enumerate(lst):
            if idx != pivot_idx:
                self.comparisons += 1
                (smaller if num < pivot else larger).append(num)
        return self.sort(smaller) + [pivot] + self.sort(larger)


def Run(n):
    """Sorts n random integers from 0..1000, printing the result and the count."""
    lst = [random.randint(0, 1000) for _ in range(n)]
    quicksort = QuickSort()
    print(quicksort.sort(lst))
    print(quicksort.comparisons)


Run(100)
Building on the answer provided by Gene by adding print statements and a sort "error" range, his example was very helpful to my understanding of quicksort and an error term on the big O impact of operations performance comparison.
import math
import random


class QuickSort:
    """Quicksort that counts comparisons and prints every partition step."""

    def __init__(self):
        self.comparisons = 0

    def sort(self, lst):
        """
        Returns a sorted copy of lst, counting one comparison per element
        tested against a pivot and printing both partitions of each step.

        lst: the values to sort (list)
        Returns: a new sorted list (lst itself when it has at most one element)
        """
        k_err = 0  # k << n, the value the sort array can be in error
        if len(lst) <= 1:
            return lst
        pivot_idx = len(lst) // 2
        pivot = lst[pivot_idx]
        smaller, larger = [], []
        for idx, num in enumerate(lst):
            if idx != pivot_idx:
                self.comparisons += 1
                # The former bare "except" retry with num + k_err is dropped:
                # for numeric input the subtraction cannot fail.
                ((smaller if (num - k_err) < pivot else larger)).append(num)
        # NOTE(review): printed once per partition step; the original nesting
        # of these two prints was lost, so this placement is a reconstruction.
        print(pivot_idx, "larger", self.comparisons, larger)
        print(pivot_idx, "smaller", self.comparisons, smaller, )
        return self.sort(smaller) + [pivot] + self.sort(larger)


def Run(n):
    """
    Sorts n seeded random integers from 0..100 and prints the comparison
    count, the count per element, that ratio over log10(n), and log10(n).
    Needs n >= 2: n == 0 divides by zero and n == 1 gives log10(n) == 0.
    """
    random.seed(100)
    lst = [random.randint(0, 100) for _ in range(n)]
    quicksort = QuickSort()
    print(len(lst), lst)
    print(quicksort.sort(lst))
    print(quicksort.comparisons, quicksort.comparisons/n, ((quicksort.comparisons/n)/math.log(n,10)), math.log(n,10) )

Resources