What is the fastest way to find all the substrings in a string, without using any modules and without making duplicates?
def lols(s):
    if not s:
        return 0
    lst = []
    for i in range(len(s)):
        for j in range(i, len(s) + 1):
            if not s[i:j]:
                pass
            elif len(s[i:j]) == len(set(s[i:j])):
                lst.append(s[i:j])
    res = max(lst, key=len)
    return res
s = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?#[\\]^_`{|}~"
s = s*100
lols(s)
This function works fine with strings shorter than about 1000 characters, but it freezes when the example string above is used, and a time limit is exceeded for large strings.
Problem
I recommend that you don't try this with super long strings like your s!
If you're able to install nltk so that it works (I recently had a problem with that, but managed to solve it by installing it in the Windows Sandbox; see here: Python3: Could not find "vcomp140.dll (or one of its dependencies)" while trying to import nltk), then this is one way to do it:
from nltk import ngrams

def lols(s):
    lst = []
    for i in range(1, len(s)):
        lst.extend([''.join(j) for j in ngrams(s, i)])
    lst.append(s)  # the full string itself is the final substring
    return lst
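Note that ngrams yields every window, so a string with repeated sections will produce duplicate substrings. If you want them deduplicated while keeping order, one way (my addition, not part of the original answer) is:

dedup = list(dict.fromkeys(lst))  # dicts preserve insertion order in Python 3.7+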
If not, you can use this instead of "from nltk import ngrams":
import collections

def ngrams(words, n):
    """Edited from https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams"""
    d = collections.deque(words[:n], maxlen=n)  # sliding window of the last n characters
    words = words[n:]
    answer = []
    for word in words:
        answer.append(''.join(d))  # record the current window...
        d.append(word)             # ...then slide it one character
    answer.append(''.join(d))      # don't forget the final window
    return answer
Demo:
>>> lols('username')
['u', 's', 'e', 'r', 'n', 'a', 'm', 'e', 'us', 'se', 'er', 'rn', 'na', 'am', 'me', 'use', 'ser', 'ern', 'rna', 'nam', 'ame', 'user', 'sern', 'erna', 'rnam', 'name', 'usern', 'serna', 'ernam', 'rname', 'userna', 'sernam', 'ername', 'usernam', 'sername', 'username']
Maybe your function's performance degrades like n² or n! (i.e. O(n²) or O(n!)), or memory becomes tight.
As for the maximum size of string you can print to stdout with the print function: since you have to pass your text to print as a Python object, and since the maximum size of a variable depends on your platform, it can be 2**31 - 1 on a 32-bit platform and 2**63 - 1 on a 64-bit platform.
For more information, see sys.maxsize.
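As a side note beyond the original answers: if the goal is specifically the longest substring without repeated characters (which is what lols ends up computing), a sliding-window scan finds it in linear time instead of quadratic or worse. A minimal sketch, assuming that interpretation of the question:

def longest_unique_substring(s):
    start = 0       # left edge of the current window
    best = ""
    last_seen = {}  # character -> index of its most recent occurrence
    for i, ch in enumerate(s):
        # if ch already occurs inside the current window, move the window past it
        if ch in last_seen and last_seen[ch] >= start:
            start = last_seen[ch] + 1
        last_seen[ch] = i
        if i - start + 1 > len(best):
            best = s[start:i + 1]
    return best

Since each character is visited only once, this handles the long example string without trouble.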
Related
I'm trying to determine if there is a way to access an index, essentially by making a list of lists where each inner list has a tuple that provides grid coordinates, i.e.:
example = [
['a', (0,0)], ['b',(0,1)], ['c', (0,2)],
['d', (1,0)], ['e',(1,1)], ['f', (1,2)],
.....
]
and so on.
So, if I have coordinates (0,1), I want to be able to return example[1][0], or at the very least example[1], since these coordinates correlate with example[1].
I tried using index(), but this doesn't go deep enough. I also looked into itertools, but I cannot find a tool that finds the item rather than just returning a boolean.
Using a number pad as an example:
from itertools import chain

def pinpad_test():
    pad = [
        ['1', (0, 0)], ['2', (0, 1)], ['3', (0, 2)],
        ['4', (1, 0)], ['5', (1, 1)], ['6', (1, 2)],
        ['7', (2, 0)], ['8', (2, 1)], ['9', (2, 2)],
        ['0', (3, 1)]
    ]
    tester = '1234'
    print(tester)
    for dig in tester:
        print(dig)
        if dig in chain(*pad):
            print(f'Digit {dig} in pad')
        else:
            print('Failed')
    print('end of tester')
    new_test = pad.index((0, 1) in chain(*pad))
    print(new_test)

if __name__ == '__main__':
    pinpad_test()
I get a ValueError at the initialisation of new_test.
You can just take the next match from a simple generator expression:
coords = (0, 1)
idx = next((sub_l[0] for sub_l in pad if sub_l[1] == coords), None)
print(idx)
2
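If you need to look coordinates up repeatedly, it may be worth building a dict keyed by coordinate once, so each lookup is O(1). A small sketch on top of the answer above (my addition):

lookup = {coord: value for value, coord in pad}  # coordinate -> digit
print(lookup[(0, 1)])  # '2'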
You can create a function that will give you what you want:

def on_coordinates(coordinates: tuple, list_coordinates: list):
    return next(x for x in list_coordinates if x[1] == coordinates)
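Note that this returns the whole inner list, not just the value. For example, with the pad list from the question:

>>> on_coordinates((0, 1), pad)
['2', (0, 1)]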
def bibek():
    test_list = [[]]
    x = int(input("Enter the length of String elements using enter -: "))
    for i in range(0, x):
        a = str(input())
        a = list(a)
        test_list.append(a)
    del test_list[0]  # drop the empty placeholder list

    def filt(b):
        d = ['b', 'i', 'b']
        if b in d:
            return True
        else:
            return False

    for t in test_list:
        x = filter(filt, t)
        for i in x:
            print(i)

bibek()
Suppose test_list = [['b','i','b'], ['s','i','b'], ['r','i','b']]; the output should be ib, since ib is common among all of them.
An option is to use set and its methods:
test_list=[['b','i','b'],['s','i','b'],['r','i','b']]
common = set(test_list[0])
for item in test_list[1:]:
common.intersection_update(item)
print(common) # {'i', 'b'}
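If order doesn't matter, the same intersection can be written in one line (my shorthand, equivalent to the loop above):

common = set.intersection(*map(set, test_list))  # {'i', 'b'}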
UPDATE: now that you have clarified your question, I would do this:
from difflib import SequenceMatcher
test_list=[['b','i','b','b'],['s','i','b','b'],['r','i','b','b']]
# convert the list to simple strings
strgs = [''.join(item) for item in test_list]
common = strgs[0]
for item in strgs[1:]:
sm = SequenceMatcher(isjunk=None, a=item, b=common)
match = sm.find_longest_match(0, len(item), 0, len(common))
common = common[match.b:match.b+match.size]
print(common) # 'ibb'
The trick here is to use difflib.SequenceMatcher in order to get the longest common string.
One more update after the clarification of your question, this time using collections.Counter:
from collections import Counter
strgs='app','bapp','sardipp', 'ppa'
common = Counter(strgs[0])
print(common)
for item in strgs[1:]:
c = Counter(item)
for key, number in common.items():
common[key] = min(number, c.get(key, 0))
print(common) # Counter({'p': 2, 'a': 1})
print(sorted(common.elements())) # ['a', 'p', 'p']
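Counter also supports & as a built-in multiset intersection that keeps the minimum count per key, so the loop above can be condensed (my variant, same result):

from collections import Counter
from functools import reduce

strgs = 'app', 'bapp', 'sardipp', 'ppa'
common = reduce(lambda acc, s: acc & Counter(s), strgs[1:], Counter(strgs[0]))
print(sorted(common.elements()))  # ['a', 'p', 'p']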
I have a question for you; first of all, here is the code:
from urllib import request
from collections import Counter
from nltk import word_tokenize
URL = 'https://www.gutenberg.org/files/46/46-0.txt'
RESPONSE = request.urlopen(URL)
RAW = RESPONSE.read().decode('utf8')
print('\n')
type(RAW)
print('\n')
len(RAW)
TOKENS = word_tokenize(RAW)
print(type(TOKENS))
X = print(len(TOKENS))
print(TOKENS[:X])
print('\n')
c = Counter(RAW)
print(c.most_common(30))
Here is the first output I get, and with that one I am satisfied:
['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'A', 'Christmas', 'Carol', ',', 'by', 'Charles',...]
Here is the second part of the output, which does not satisfy me:
[(' ', 28438), ('e', 16556), ('t', 11960), ('o', 10940), ('a', 10092), ('n', 8868), ('i', 8791),...]
Here is my question: as you can see, I am counting the most frequently occurring strings in the text, but the problem is that I want to count whole elements of the list of words. The final part of the second output should look something like this:
[('Dickens', 28438), ('Project', 16556), ('Gutenberg', 11960),...]
and not as you can see above in the second part of the output. I want to show the 30 most frequently used words in the text, not parts of the elements of the list.
Do you know how I can solve that problem? Thanks for helping.
Try changing this line, so that you count the word tokens rather than the characters of the raw string:
c = Counter(TOKENS)
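The underlying point is that Counter simply iterates over whatever you pass it: a string yields its characters, a list yields its items. A quick illustration:

from collections import Counter
print(Counter("abab"))         # counts characters: Counter({'a': 2, 'b': 2})
print(Counter(["ab", "ab"]))   # counts list items: Counter({'ab': 2})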
Here is your full code with that change:
from urllib import request
from collections import Counter
from nltk import word_tokenize
URL = 'https://www.gutenberg.org/files/46/46-0.txt'
RESPONSE = request.urlopen(URL)
RAW = RESPONSE.read().decode('utf8')
print('\n')
type(RAW)
print('\n')
len(RAW)
TOKENS = word_tokenize(RAW)
print(type(TOKENS))
X = print(len(TOKENS))
print(TOKENS[:X])
print('\n')
c = Counter(TOKENS)
print(c.most_common(500))
I have done my code this far, but it is not working properly with remove(). Can anyone help me?
'''
Created on Apr 21, 2015
@author: Pallavi
'''
from pip._vendor.distlib.compat import raw_input

print("Enter Query")
str = raw_input()
fo = open("stopwords.txt", "r+")
str1 = fo.read()
list = str1.split("\n")
fo.close()
words = str.split(" ")
for i in range(0, len(words)):
    for j in range(0, len(list)):
        if(list[j] == words[i]):
            print(words[i])
            words.remove(words[i])
Here is the error:
Enter Query
let them cry try diesd
let them try
Traceback (most recent call last):
File "C:\Users\Pallavi\workspace\py\src\parser.py", line 17, in <module>
if(list[j]==words[i]):
IndexError: list index out of range
The errors you have (besides my other comments) are because you're modifying a list while iterating over it: you take the length of the list at the start, so after you've removed some elements, the last positions are no longer valid indices.
I would do it this way:
words = ['a', 'b', 'a', 'c', 'd']
stopwords = ['a', 'c']
for word in list(words):  # iterate over a copy, since removing from the list you're walking would mess things up
    if word in stopwords:
        words.remove(word)
An even more Pythonic way, using a list comprehension:
new_words = [word for word in words if word not in stopwords]
As an observation, this could be another elegant way to do it:
new_words = list(filter(lambda w: w not in stopwords, words))
''' call this script in a Bash Konsole like so: python reject.py

purpose of this script: remove certain words from a list of words,
e.g. remove invalid packages in a request-list using
a list of rejected packages from the logfile,
say on https://fai-project.org/FAIme/#

remove trailing spaces e.g. with KDE Kate in the wordlist like so:
kate: remove-trailing-space on; BOM off;
'''
with open("rejects", "r+") as fooo:
    stwf = fooo.read()
toreject = stwf.split("\n")

with open("wordlist", "r+") as bar:
    woL = bar.read()
words = woL.split("\n")

new_words = [word for word in words if word not in toreject]

with open("cleaned", "w+") as foobar:
    for ii in new_words:
        foobar.write("%s\n" % ii)
One more easy way to remove words from the list is to convert both lists into sets and subtract one from the other:
words = ['a', 'b', 'a', 'c', 'd']
words = set(words)
stopwords = ['a', 'c']
stopwords = set(stopwords)
final_list = words - stopwords
final_list = list(final_list)
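Note that the set round-trip also removes duplicates within words and does not preserve the original order:

print(final_list)  # e.g. ['b', 'd'] (set ordering is arbitrary)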
Alright, so I'm required to eliminate spaces and duplicate values in a list (of only numbers). Here's my code:
def eliminateDuplicates(lst):
    i = 0
    while i < len(lst):
        while lst.count(lst[i]) != 1:
            lst.remove(lst[i])
        i = i + 1
    print(lst)

def main():
    a = input("Enter numbers: ")
    lst = list(a)
    while ' ' in lst:
        lst.remove(' ')
    eliminateDuplicates(lst)

main()
While this method is effective and works, when the input is, say,
Enter numbers: 1 2 3 4 5 3 2 1 1 22
The output results in
['4', '5', '3', '1', '2']
I need my program to recognize 22 and 2 as different items so it doesn't delete the last 2 and the 2 in 22. Any suggestions?
EDIT: Sorry to the two posters who have already given me answers. I am not allowed to use the set function, and order does not matter.
This doesn't do what you think it does:
b="".join(a) # doesn't do anything useful since `a` is already a string
lst=list(b) # this is converting the string to a list of characters
Try this instead:
lst = a.split() # automatically cleans up the whitespace for you
print(list(set(lst)))
Turning a list into a set and back again is a handy way to remove duplicates. It's also quite efficient compared to the way you are doing it, scanning the list over and over.
If you really want to keep the eliminateDuplicates function, then it can just be:
def eliminate_duplicates(lst):
    return list(set(lst))

def main():
    a = input("Enter numbers: ")
    lst = a.split()  # split automatically cleans up the whitespace
    print(eliminate_duplicates(lst))

if __name__ == "__main__":
    main()
Edit: since you're not allowed to use set, the collections module offers another fairly efficient way to remove duplicates:
from collections import Counter

def eliminate_duplicates(lst):
    return list(Counter(lst))
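For example (Counter, like all dicts in Python 3.7+, preserves insertion order, so each item keeps the position of its first occurrence):

>>> eliminate_duplicates(['1', '2', '3', '22', '2'])
['1', '2', '3', '22']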
This is not quite so efficient, but still much better than two nested loops
from itertools import groupby

def eliminate_duplicates(lst):
    return [k for k, g in groupby(sorted(lst))]
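Note that this version returns the items in sorted order rather than the original order:

>>> eliminate_duplicates(['1', '2', '3', '22', '2'])
['1', '2', '22', '3']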
Does order matter? If not, cast it to a set and then cast it back to a list:
lst = [1,2,3,3,6,4,5,6, 3, 22]
lst2 = list(set(lst))
Also, you should probably use lst = a.split(' ') rather than join
def main():
    a = input("Enter numbers: ")  # input the numbers
    clean_a = a.strip()  # clean trailing whitespace
    lst = list(set(clean_a.split(' ')))  # split into tokens, and remove duplicates
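To see the result, you could finish the function by printing the list and then calling main(); for the sample input above, the output contains each distinct token once, in arbitrary set order:

    print(lst)  # e.g. ['22', '1', '3', '2', '4', '5']

main()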