Python generator that returns group of items - python-3.x

I am trying to make a generator that returns a group of consecutive items from a list, where the window "moves" by only one index at a time, similar to a moving average filter in DSP. For instance, if I have the list:
l = [1,2,3,4,5,6,7,8,9]
I would expect this output:
[(1,2,3),(2,3,4),(3,4,5),(4,5,6),(5,6,7),(6,7,8),(7,8,9)]
I have written some code, but it does not work with filters, generators, etc. I am also afraid it will break due to memory use if I need to provide a large list of words.
Function gen:
def gen(enumobj, n):
    for idx, val in enumerate(enumobj):
        try:
            yield tuple(enumobj[i] for i in range(idx, idx + n))
        except:
            break
and the example code:
words = ['aaa','bb','c','dddddd','eeee','ff','g','h','iiiii','jjj','kk','lll','m','m','ooo']
w = filter(lambda x: len(x) > 1, words)
# It's working with list
print('\nList:')
g = gen(words, 4)
for i in g: print(i)
# It's not working with filters / generators etc.
print('\nFilter:')
g = gen(w, 4)
for i in g: print(i)
The list version works, but the filter loop does not produce anything: the code breaks because it is not possible to index a filter object. Of course, one answer is to force a list with list(w), but I am looking for a better version of the function. How can I change it so that it accepts filters, generators, etc. as well? I am worried about memory usage with a huge amount of data in a list.
Thanks

With iterators you need to keep track of values that have already been read. An n-sized list does the trick: append the next value to the list and discard the oldest item after each yield.
import itertools

def gen(enumobj, n):
    # we need an iterator for the `next` call below. this creates
    # an iterator from an iterable such as a list, but leaves
    # iterators alone.
    enumobj = iter(enumobj)
    # cache the first n objects (fewer if iterator is exhausted)
    cache = list(itertools.islice(enumobj, n))
    # while we still have something in the cache...
    while cache:
        yield cache
        # drop stale item
        cache.pop(0)
        # try to get one new item, stopping when iterator is done
        try:
            cache.append(next(enumobj))
        except StopIteration:
            # pass to emit progressively smaller units
            #pass
            # break to stop when fewer than `n` items remain
            break
words = ['aaa','bb','c','dddddd','eeee','ff','g','h','iiiii','jjj','kk','lll','m','m','ooo']
w = filter(lambda x: len(x) > 1, words)
# It's working with list
print('\nList:')
g = gen(words, 4)
for i in g: print(i)
# now it works with iterators
print('\nFilter:')
g = gen(w, 4)
for i in g: print(i)
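For what it's worth, a similar sliding window can also be built on collections.deque with maxlen=n, which discards the oldest item automatically. The window helper below is just a sketch of that idea (it yields tuple snapshots, so stored results don't change when the window moves on), not part of the answer above:

from collections import deque
from itertools import islice

def window(iterable, n):
    it = iter(iterable)
    # a deque with maxlen=n drops the oldest item as new ones are appended
    win = deque(islice(it, n), maxlen=n)
    if len(win) == n:
        yield tuple(win)
    for item in it:
        win.append(item)
        yield tuple(win)

words = ['aaa', 'bb', 'c', 'dddddd', 'eeee', 'ff', 'g', 'h', 'iiiii']
print(list(window(filter(lambda x: len(x) > 1, words), 4)))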

Related

Condensing tuple of tuple of tuples of tuples of... into a list of reduced items

I have an iterable with a lot of nesting - not always to equal depth, but with the guarantee that each element of the original is at least once-iterable - as I define in the block quote below.
An item is more than once iterable if all of its elements are iterable.
An item is once iterable if none of its elements are iterable.
An item is twice iterable if its elements are iterables of non-iterable elements, e.g. ((1,),(1,2,),(1,2,3,)) is twice iterable.
Below, I have a few examples of these such iterables that I consider to be input.
trie_like= ( ((((('a','b','c','d'),), (('a','b','c','e'),),), ((('a','b','d','e'),),),), (((('a','c','d','e'),),),),), ((((('b','c','d','e'),),),),), )
uneven = (((1,),(2,),(3,)),((1,2,3),), (12,34),)
I aim to write a function that reduces the level of nesting so that each element of the condensed iterable is a once-iterable tuple. That is to say, the function unnests down to the nearest twice-iterable object.
trie_like_condensed = (('a','b','c','d'),('a','b','c','e'),('a','b','d','e'), ('a','c','d','e'),('b','c','d','e'),)
uneven_condensed = ((1,),(2,),(3,),(1,2,3),(12,34),)
I thought I might be able to repeatedly call itertools.chain.from_iterable on the tuple until I can map it, but I can't quite get the end condition right.
import itertools
from collections.abc import Iterable

t = ( ((((('a','b','c','d'),), (('a','b','c','e'),),), ((('a','b','d','e'),),),), (((('a','c','d','e'),),),),), ((((('b','c','d','e'),),),),), )
count = 0  # just to avoid inf loops
while isinstance(t, Iterable) and count < 20:
    count = count + 1
    t, p = itertools.tee(itertools.chain.from_iterable(t))
    print(list(p))
print(*map("".join, t))
For this I think I've got the depth test wrong, AND it would only work for the variable I called trie_like, which has depth 4 at every deepest tuple. I thought maybe I could then use something like itertools.takewhile, but I can't get that to work either; my attempt is below.
def unnest_iterables(t, count=0):
    iter_part = itertools.takewhile(lambda x: isinstance(x, Iterable), t)
    niter_part = itertools.takewhile(lambda x: not isinstance(x, Iterable), t)
    # rest of t processed by recursion
    iter_part, p = itertools.tee(iter_part)
    niter_part, pn = itertools.tee(niter_part)
    print(f'Depth = {count}: {list(p)} : {list(pn)}')
    if count > 10:
        print('TEST: Big oof')
        return []
    if iter_part is None or niter_part is None:
        return itertools.chain(itertools.chain.from_iterable(iter_part), niter_part)
    try:
        return unnest_iterables(itertools.chain(itertools.chain.from_iterable(iter_part), niter_part), count+1)
    except StopIteration:
        return itertools.chain(itertools.chain.from_iterable(iter_part), niter_part)
t = ( (((((1,2,3,4),), ((1,2,3,5),),), (((1,2,4,5),),),), ((((1,3,4,5),),),),), (((((2,3,4,5),),),),),)
t = ( (((((1,2,3,4),), ((1,2,3,5),),), (((1,2,4,5),),),), ((((1,3,4,5),),),),), ((((2,3,4,5),),),),)
print(*unnest_iterables(t))
Based on your sample input and output, you only need to collect all once-iterable items in the input and put them into a tuple, which makes the output a twice-iterable tuple. This can be achieved without the itertools module:
def is_once_iterable(items: tuple):
    return any(not isinstance(item, tuple) for item in items)

# Collect all once-iterable items and put them into a list
def _simplify_nesting(items: tuple, condensed: list):
    if is_once_iterable(items):
        condensed.append(items)
        return
    for item in items:
        if isinstance(item, tuple):
            _simplify_nesting(item, condensed)

# Wrapper function for _simplify_nesting
def simplify_nesting(items: tuple):
    condensed = []
    _simplify_nesting(items, condensed)
    return tuple(condensed)

inp = (((1,),(2,),(3,)),((1,2,3),), (12,34),)
print(simplify_nesting(inp))  # ((1,), (2,), (3,), (1, 2, 3), (12, 34))
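If lazy output is preferred, a minimal generator-based sketch of the same idea could look like this (the name iter_once_iterable is my own; it reuses is_once_iterable and inp from above):

def iter_once_iterable(items: tuple):
    # yield the tuple itself once we reach a once-iterable level,
    # otherwise recurse into each nested tuple
    if is_once_iterable(items):
        yield items
        return
    for item in items:
        if isinstance(item, tuple):
            yield from iter_once_iterable(item)

print(tuple(iter_once_iterable(inp)))  # ((1,), (2,), (3,), (1, 2, 3), (12, 34))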

multiple assignment for functions with *args

I am trying to write a function that will be used on multiple dictionaries of dataframes. My hope is to perform multiple assignments all in one line, for example:
x, y, z = function(x, y, z)
However, with this function I can't return multiple values for the multiple assignment. This is what I currently have:
def split_pre(*args):
    for arg in args:
        newdict = {}
        for key, sheet in arg.items():
            if isinstance(sheet, str):
                continue
            else:
                newdict[key] = sheet[sheet.Year < 2000]
        return newdict
My thinking is that for each arg it would return the dictionary I created, but I get:
ValueError: too many values to unpack (expected 2)
The inputs to this function would be dictionaries made up of dataframes, e.g.,
x = {1:df, 2:df, 3:df...}
and the desired output would have the same structure, but with the dataframes altered by the function.
I'm still quite new to Python and this isn't super important, but I was wondering if anyone knew of a succinct way to do this.
Do you want to return one dictionary per arg?
As already stated by @DeepSpace, Python stops processing the function when the first return statement is executed. You can fix your problem in two ways: either create a list in which you collect the dictionaries you want to return, or create a generator function:
# Solution with a list
def split_pre(*args):
    ans = []
    for arg in args:
        newdict = {}
        for key, sheet in arg.items():
            if isinstance(sheet, str):
                continue
            else:
                newdict[key] = sheet[sheet.Year < 2000]
        ans.append(newdict)
    return ans
or
# Solution with a generator
def split_pre(*args):
    for arg in args:
        newdict = {}
        for key, sheet in arg.items():
            if isinstance(sheet, str):
                continue
            else:
                newdict[key] = sheet[sheet.Year < 2000]
        yield newdict
If you call the function the way you do (a, b, c = func(x, y, z)), both samples will work the same way. They are not actually the same, though, and I'd recommend the list solution if you're not familiar with generators (you can read more about the yield keyword here).
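As a purely illustrative usage sketch (the DataFrames here are made up, and split_pre is either version above), the one-line multiple assignment works because exactly one dictionary is produced per argument:

import pandas as pd

df = pd.DataFrame({'Year': [1995, 2005], 'Value': [10, 20]})
x = {1: df, 2: df}
y = {1: df}
z = {1: df, 2: df, 3: df}

# one dictionary comes back per input dictionary, so unpacking works
x, y, z = split_pre(x, y, z)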

Number of sub sequences of length K having total sum S, given 2d array

I wish to find the number of subsequences of length K having total sum S, given an array.
Sample Input:
a=[1,1,1,2,2] & K=2 & S=2
Sample Output:
3 (because a[0],a[1]; a[1],a[2]; a[0],a[2] are the only three possibilities for this case)
I have tried to write a recursive solution in Python as a starter, but it isn't giving the expected output. Can you please help me find the flaw I might be missing?
def rec(k, sum1, arr, i=0):
    #print('k: '+str(k)+' '+'sum1: '+str(sum1))
    # (1) Base cases: the required sum (sum1) and the number of elements
    # still needed (k) must reach zero simultaneously; only then do we
    # count one valid subsequence.
    if sum1 == 0 and k != 0:
        return 0
    if k == 0 and sum1 != 0:
        return 0
    if k == 0 and sum1 == 0:
        return 1
    # (2) if the iterator i reaches the final element of the array we
    # should return 0 when k and sum1 aren't both zero
    base_check = sum1 != 0 or k != 0
    if i == len(arr) and base_check:
        return 0
    # recursive step: either take the first element or reject it
    # (i is used as an iterator so rec sees the array from the 2nd element on)
    if sum1 < arr[0]:
        ans = rec(k-1, sum1, arr[i+1:len(arr)], i+1)
        print(ans)
    else:
        ans = rec(k-1, sum1-arr[0], arr[i+1:len(arr)], i+1) + rec(k, sum1, arr[i+1:len(arr)], i+1)
    #print('i: '+str(i)+' ans: '+str(ans))
    return ans

a = [1, 1, 1, 2, 2]
print(rec(2, 2, a))
I am still unable to work out what changes to make. Once this plain recursive code is working I might move on to a DP approach accordingly.
Using itertools.combinations
The function itertools.combinations returns all the subsequences of a given length. We then filter to keep only the subsequences that sum to the desired value.
import itertools

def countsubsum(a, k, s):
    return sum(1 for c in itertools.combinations(a, k) if sum(c) == s)
Fixing your code
Your code looks pretty good, but there are two things that appear wrong about it.
What is this if for?
At first I was a bit confused about if(sum1<arr[0]):. I think you can (and should) always go to the else branch. After thinking about it some more, I understand you are trying to get rid of one of the two recursive calls if arr[0] is too large to be taken, which is smart, but this makes the assumption that all elements in the array are nonnegative. If the array is allowed to contain negative numbers, then you can include a large a[0] in the subsequence, and hope for a negative element to compensate. So if the array can contain negative numbers, you should get rid of this if/else and always execute the two recursive calls from the else branch.
You are slicing wrong
You maintain a variable i to remember where to start in the array, but you also slice the array, so pretty soon your indices become wrong. You should use slices, or use an index i, but not both.
# WRONG
ans=rec(k-1, sum1-arr[0], arr[i+1:len(arr)],i+1)+rec(k, sum1, arr[i+1:len(arr)],i+1)
# CORRECT
ans = rec(k-1, sum1-arr[i], arr, i+1) + rec(k, sum1, arr, i+1)
# CORRECT
ans = rec(k-1, sum1-arr[0], arr[1:]) + rec(k, sum1, arr[1:])
To understand why using both slicing and an index gives wrong results, run the following code:
def iter_array_wrong(a, i=0):
    if a:
        print(i, a)
        iter_array_wrong(a[i:], i+1)

def iter_array_index(a, i=0):
    if i < len(a):
        print(i, a)
        iter_array_index(a, i+1)

def iter_array_slice(a):
    if a:
        print(a)
        iter_array_slice(a[1:])

print('WRONG')
iter_array_wrong(list(range(10)))
print()
print('INDEX')
iter_array_index(list(range(10)))
print()
print('SLICE')
iter_array_slice(list(range(10)))
Also note that a[i:len(a)] is exactly equivalent to a[i:] and a[0:j] is equivalent to a[:j].
Clean version of the recursion
Recursively count the subsequences that use the first element of the array and the subsequences that don't, and add the two counts. To avoid explicitly slicing the array repeatedly, which is an expensive operation, we keep a variable start to remember that we are only working on the subarray a[start:].
def countsubsum(a, k, s, start=0):
    if k == 0:
        return 1 if s == 0 else 0
    elif start == len(a):
        return 0
    else:
        using_first_element = countsubsum(a, k-1, s-a[start], start+1)
        notusing_first_elem = countsubsum(a, k, s, start+1)
        return using_first_element + notusing_first_elem
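As a quick sanity check, both the itertools.combinations version and the recursion should agree on the sample input from the question:

a = [1, 1, 1, 2, 2]
print(countsubsum(a, 2, 2))  # 3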

Fastest way to filter non frequent words inside lists of words

I have a dataset containing lists of tokens in CSV format, like this:
song, tokens
aaa,"['everyon', 'pict', 'becom', 'somebody', 'know']"
bbb,"['tak', 'money', 'tak', 'prid', 'tak', 'littl']"
First I want to find all the words that appear in the text at least a certain number of times, let's say 5, and this is easily done:
import pandas as pd

# converters simply reconstruct the string of tokens into a list of tokens
tokens = pd.read_csv('dataset.csv',
                     converters={'tokens': lambda x: x.strip("[]").replace("'", "").split(", ")})

# List of all words
allwords = [word for tokens in darklyrics['tokens'] for word in tokens]
allwords = pd.DataFrame(allwords, columns=['word'])

more5 = allwords[allwords.groupby("word")["word"].transform('size') >= 5]
more5 = set(more5['word'])
frequentwords = [token.strip() for token in more5]
frequentwords.sort()
Now, for each list of tokens, I want to keep only the words that appear in frequentwords (i.e. drop the infrequent ones). To do so I'm using this code:
import numpy as np
import pandas as pd
from multiprocessing import Pool

def remove_non_frequent(x):
    global frequentwords
    output = []
    for token in x:
        if token in frequentwords:
            output.append(token)
    return output

def remove_on_chunk(df):
    df['tokens'] = df.apply(lambda x: remove_non_frequent(x['tokens']), axis=1)
    return df

def parallelize_dataframe(df, func, n_split=10, n_cores=4):
    df_split = np.array_split(df, n_split)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

lyrics_reconstructed = parallelize_dataframe(lyrics, remove_on_chunk)
The non-multiprocessing version takes around 2.5-3 hours to compute, while this version takes 1 hour.
It's surely a slow process, because I have to look up roughly 130 million tokens in a list of 30k elements, but I'm quite sure my code is not particularly good.
Is there a faster and better way to achieve something like this?
Go for set operations. I've saved your example data to a "tt1" file, so this should work. Also, if you are generating the data yourself, do yourself a favour and drop the quotes and square brackets; it would save you time in pre-processing.
from collections import Counter
import re

rgx = re.compile(r"[\[\]\"' \n]")  # data cleanup

# load and pre-process the data
counter = Counter()
data = []
with open('tt1', 'r') as o:
    o.readline()
    for line in o:
        parts = line.split(',')
        clean_parts = {re.sub(rgx, "", i) for i in parts[1:]}
        counter.update(clean_parts)
        data.append((parts[0], clean_parts))

n = 2  # <- here set threshold for number of occurrences
common_words = {i[0] for i in counter.items() if i[1] > n}

# process the data
clean_data = []
for s, r in data:
    clean_data.append((s, r - common_words))
It's been a while, but I'll post the correct solution to the problem. Thanks to Marek, because it is just a slight modification of his code.
He uses sets, which can't handle duplicates, so the obvious idea is to reuse the same code but with multisets.
I've worked with this implementation: https://pypi.org/project/multiset/
from collections import Counter
import re
from multiset import Multiset

rgx = re.compile(r"[\[\]\"' \n]")  # data cleanup

# load and pre-process the data
counter = Counter()
data = []
with open('tt1', 'r') as o:
    o.readline()
    for line in o:
        parts = line.split(',')
        clean_parts = [re.sub(rgx, "", i) for i in parts[1:]]
        counter.update(clean_parts)
        ms = Multiset()
        for word in clean_parts:
            ms.add(word)
        data.append([parts[0], ms])

n = 2  # <- here set threshold for number of occurrences
common_words = Multiset()
# I'm using intersection with the most common words since
# common_words is way smaller than uncommon_words.
# Intersection returns the lowest count between two multisets,
# e.g. ('sky', 10) and ('sky', 1) will produce ('sky', 1).
# I want the number of repeated words in my documents, so I set the
# common-words counter to be very high.
for item in counter.items():
    if item[1] >= n:
        common_words.add(item[0], 100)

# process the data
clean_data = []
for s, r in data:
    clean_data.append((s, r.intersection(common_words)))

output_data = []
for s, ms in clean_data:
    tokens = []
    for item in ms.items():
        for i in range(0, item[1]):
            tokens.append(item[0])
    output_data.append([s] + [tokens])
This code extracts the most frequent words and filters each document against that list; on a 110 MB dataset it does the job in less than 2 minutes.
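If you'd rather avoid the extra multiset dependency, a rough sketch of the same filtering step with a plain set and a list comprehension (which keeps duplicates naturally) might look like this; it assumes counter and data are built as in Marek's answer, but with clean_parts kept as a list instead of a set:

# threshold n and the Counter are assumed to be built as above
common_words = {word for word, count in counter.items() if count >= n}

clean_data = []
for song, tokens in data:
    # membership tests against a set are O(1) per token, and the
    # list comprehension preserves repeated words
    clean_data.append((song, [t for t in tokens if t in common_words]))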

k way merge sort divide and conquer

from math import ceil

def merge(all_lst):
    sorted_lst = []
    while all_lst:
        min_value, index = all_lst[0][0], 0
        for lst in all_lst:
            if lst[0] < min_value:
                min_value = lst[0]
                index = all_lst.index(lst)
        sorted_lst.append(min_value)
        all_lst[index].pop(0)
        if not all_lst[index]:
            all_lst.remove(all_lst[index])
    return sorted_lst

def merge_sort(lst, k):
    def split(lst):
        split_lst = []
        j = ceil(len(lst)/k) if len(lst) >= k else 1
        for i in range(0, len(lst), j):
            split_lst.append(lst[i:i+j])
        return split_lst
    lst = split(lst)
    if len(lst[0]) == 1:
        return lst
    else:
        for i in range(len(lst)):
            lst[i] = merge(merge_sort(lst[i], k))
        return merge(lst)
Above is my code for k-way merge sort. Basically, it splits the list into k smaller lists by calling the split function until each sublist contains a single element, and the list of sublists is then merged into one single list.
My code works fine when the splitting is done twice (e.g. [3,6,8,5,2,1,4,7] -> [3,6,8],[5,2,1],[4,7] -> [3],[6],[8],[5],[2],[1],[4],[7]). But when the splitting is done more than twice (e.g. [3,6,8,5,2,1,4,7] -> [3,6,8,5],[2,1,4,7] -> [3,6],[8,5],[2,1],[4,7] -> [3],[6],[8],[5],[2],[1],[4],[7]), the code fails. Can anyone help me find out what goes wrong in my code? Thanks in advance.
I believe the problem you're having is that merge_sort sometimes returns a flattened list and other times returns a list of lists. You should probably return a flat list in all cases. There's some other cruft: You don't need split to be its own function, since you only call it the one time.
Here's a greatly simplified version of your code:
def merge_sort(lst, k):
    if len(lst) == 1:  # simpler base case
        return lst
    j = ceil(len(lst)/k)  # no need to check for k < len(lst) (ceil handles it)
    # split and recursively sort in one step
    lst = [merge_sort(lst[i:i+j], k) for i in range(0, len(lst), j)]
    return merge(lst)  # always return a merged list (never a list of lists)
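A quick usage check of the simplified version (together with the merge function from the question) on the example list from the question:

print(merge_sort([3, 6, 8, 5, 2, 1, 4, 7], 2))  # [1, 2, 3, 4, 5, 6, 7, 8]
print(merge_sort([3, 6, 8, 5, 2, 1, 4, 7], 3))  # [1, 2, 3, 4, 5, 6, 7, 8]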
