Iterating through values in a dictionary to invert keys and values - python-3.x

I am trying to invert an Italian-English dictionary using the code that follows.
Some terms have one translation, while others have multiple possibilities. If an entry has multiple translations, I iterate through each word, adding it to the English-Italian dict (if not already present).
If there is a single translation it should not iterate, but as I have written the code, it does. Also, only the last translation of a term with multiple translations is added to the dictionary. I cannot figure out how to rewrite the code to solve what should be a really simple task.
from collections import defaultdict

def invertdict():
    source_dict = {'paramezzale (s.m.)': ['hog', 'keelson', 'inner keel'], 'vento (s.m.)': 'wind'}
    english_dict = defaultdict(list)
    for parola, words in source_dict.items():
        if len(words) > 1:  # more than one translation?
            for word in words:  # if true, iterate through each word
                word = str(word).strip(' ')
                print(word)
        else:  # only one translation, don't iterate!!
            word = str(words).strip(' ')
            print(word)
        if word in english_dict.keys():  # check to see if the term already exists
            if english_dict[word] != parola:  # check that the italian is not present
                #english_dict[word] = [english_dict[word], parola]
                english_dict[word].append(parola).strip('')
        else:
            english_dict[word] = parola.strip(' ')
    print(len(english_dict))
    for key, value in english_dict.items():
        print(key, value)
When this code is run, I get:
hog
keelson
inner keel
w
i
n
d
2
inner keel paramezzale (s.m.)
d vento (s.m.)
instead of
hog: paramezzale, keelson: paramezzale, inner keel: paramezzale, wind: vento

It would be easier to use lists everywhere in the dictionary, like:
source_dict = {'many translations': ['a', 'b'], 'one translation': ['c']}
Then you need 2 nested loops. Right now you're not always running the inner loop.
for italian_word, english_words in source_dict.items():
    for english_word in english_words:
        # print, add to english dict, etc.
If you can't change the source_dict format, you need to check the type explicitly. I would transform the single item into a list.
for italian_word, item in source_dict.items():
    if not isinstance(item, list):
        item = [item]
Full code:
from collections import defaultdict

source_dict = {'paramezzale (s.m.)': ['hog', 'keelson', 'inner keel'], 'vento (s.m.)': ['wind']}
english_dict = defaultdict(list)
for parola, words in source_dict.items():
    for word in words:
        word = str(word).strip(' ')
        # add to the list if not already present;
        # english_dict is a defaultdict(list), so we can use .append directly
        if parola not in english_dict[word]:
            english_dict[word].append(parola)
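A quick way to check the result (a usage sketch, not part of the original answer; the ordering shown assumes Python 3.7+ insertion-ordered dicts):
for word, parole in english_dict.items():
    print(word, parole)
# hog ['paramezzale (s.m.)']
# keelson ['paramezzale (s.m.)']
# inner keel ['paramezzale (s.m.)']
# wind ['vento (s.m.)']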

Related

Can we get columns names sorted in the order of their tf-idf values (if exists) for each document?

I'm using sklearn's TfidfVectorizer. I'm trying to get, for each document, the column names in a list ordered by decreasing tf-idf value. So basically, if a document contains only stop words, we don't need any column names.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

msg = ["My name is Venkatesh",
       "Trying to get the significant words for each vector",
       "I want to get the list of words name in the decreasing order of their tf-idf values for each vector",
       "is to my"]
stopwords = ['is', 'to', 'my', 'the', 'for', 'in', 'of', 'i', 'their']
tfidf_vect = TfidfVectorizer(stop_words=stopwords)
tfidf_matrix = tfidf_vect.fit_transform(msg)
pd.DataFrame(tfidf_matrix.toarray(),
             columns=tfidf_vect.get_feature_names_out())
I want to generate a column with the list word names in the decreasing order of their tf-idf values
So the column would be like this
['venkatesh','name']
['significant','trying','vector','words','each','get']
['decreasing','idf','list','order','tf','values','want','each','get','name','vector','words']
[] # empty list Since the document consists only stopwords
Above is the primary result I'm looking for; it would also be great to get, for each document, a sorted dict with tf-idf values as keys and the list of words associated with that tf-idf value as values.
So the result would look like this:
{'0.785288':['venkatesh'],'0.619130':['name']}
{'0.47212':['significant','trying'],'0.372225':['vector','words','each','get']}
{'0.314534':['decreasing','idf','list','order','tf','values','want'],'0.247983':['each','get','name','vector','words']}
{} # empty dict Since the document consists only stopwords
I think this code does what you want and avoids using pandas:
from itertools import groupby

sort_func = lambda v: v[0]  # sort by the first value in the tuple

all_dicts = []
for row in tfidf_matrix.toarray():
    sorted_vals = sorted(zip(row, tfidf_vect.get_feature_names_out()), key=sort_func, reverse=True)
    all_dicts.append({val: [g[1] for g in group]
                      for val, group in groupby(sorted_vals, key=sort_func)
                      if val != 0})
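Note that itertools.groupby only groups consecutive elements, which is why each row is sorted by value before grouping. A quick way to inspect the result (a usage sketch, assuming the all_dicts built above):
for d in all_dicts:
    print(d)
# each printed dict maps a nonzero tf-idf value to its words, e.g. the second row
# groups 'significant'/'trying' under one value and 'each'/'get'/'vector'/'words' under another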
You could make it even less readable and put it all in a single comprehension! :-)
The combination of the following function and the to_dict() method on the dataframe can give you the desired output.
def ret_dict(_dict):
    # Get a list of unique values
    list_keys = list(set(_dict.values()))
    processed_dict = {key: [] for key in list_keys}
    # Prepare the dictionary: group words by their tf-idf value
    for key, value in _dict.items():
        processed_dict[value].append(str(key))
    # Sort the keys in decreasing order, dropping the zero values
    sorted_keys = sorted(processed_dict, reverse=True)
    sorted_keys = [key for key in sorted_keys if key > 0]
    # Return the dictionary with sorted keys
    sorted_dict = {k: processed_dict[k] for k in sorted_keys}
    return sorted_dict
Then:
res = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vect.get_feature_names_out())
list_dict = res.to_dict('records')
processed_list = []
for _dict in list_dict:
    processed_list.append(ret_dict(_dict))
processed_list contains the output you desire. For instance: processed_list[1] would output:
{0.47212002654617047: ['significant', 'trying'], 0.3722248517590162: ['each', 'get', 'vector', 'words']}
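(And processed_list[3] comes out as the empty dict {}, since that document contains only stop words.)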

How to filter a certain type of python list

I have a list of strings. Each string has the same length/number of characters in the format
xyzw01.ext or xyzv02.ext, etc.
For example
list 1: ['ABCJ01.ext','CDEJ02.ext','ADEJ01.ext','CDEJ01.ext','ABCJ02.ext','CDEJ03.ext']
list 2: ['ABCJ01.ext','ADEJ01.ext','CDEJ01.ext','RPNJ01.ext','PLEJ01.ext']
From these lists I would like to build new lists containing only the strings with the highest number.
So from list 1 I would like to get
['ADEJ01.ext','ABCJ02.ext','CDEJ03.ext']
while for list 2 I would like to get the same list since all numbers are 01.
Is there a "simple" way of achieving this?
You can use defaultdict and max
from collections import defaultdict

def fun(lst):
    res = defaultdict(list)
    for x in lst:
        res[x[:4]].append(x)
    # for each prefix, keep the entry with the largest two-digit number
    return [max(res[x], key=lambda s: s[4:6]) for x in res]

lst = ['ABCJ01.ext','CDEJ02.ext','ADEJ01.ext','CDEJ01.ext','ABCJ02.ext','CDEJ03.ext']
lst2 = ['ABCJ01.ext','ADEJ01.ext','CDEJ01.ext','RPNJ01.ext','PLEJ01.ext']
print(fun(lst))
print(fun(lst2))
Output:
['ABCJ02.ext', 'CDEJ03.ext', 'ADEJ01.ext']
['ABCJ01.ext', 'ADEJ01.ext', 'CDEJ01.ext', 'RPNJ01.ext', 'PLEJ01.ext']
The easiest way is probably to use an intermediate data structure, like a dict - sort the list items into buckets based on the first part of their names, and then take the maximum number for each bucket. We can just use the built-in max() without a key, since as-given lexicographic sorting works to find the largest. If that's not sufficient, you could use more regex to take the number out of the item and use it as the key instead.
import re

def filter_list(lst):
    prefixes = {}
    for item in lst:
        # use regex to isolate the non-numeric characters at the start of the string
        prefix = re.match(r'^([^0-9]*)', item).group(1)
        # make a bucket for each prefix, and put the item in it
        prefixes.setdefault(prefix, [])
        prefixes[prefix].append(item)
    # a list comprehension taking the maximum item from each bucket
    return [max(value) for value in prefixes.values()]
>>> a = ['ABCJ01.ext','CDEJ02.ext','ADEJ01.ext','CDEJ01.ext','ABCJ02.ext','CDEJ03.ext']
>>> b = ['ABCJ01.ext','ADEJ01.ext','CDEJ01.ext','RPNJ01.ext','PLEJ01.ext']
>>> filter_list(a)
['ABCJ02.ext', 'CDEJ03.ext', 'ADEJ01.ext']
>>> filter_list(b)
['ABCJ01.ext', 'ADEJ01.ext', 'CDEJ01.ext', 'RPNJ01.ext', 'PLEJ01.ext']
In Python 3.7+, this should preserve the order of the list from the first occurrence of each prefix (i.e. CDEJ03.ext will precede ADEJ01.ext in the output because CDEJ02.ext precedes it in the input).
To get the output in the exact same order as the original list, you'd want to explicitly reassign the key instead of using .setdefault(), perhaps with a pattern like prefixes[prefix] = prefixes[prefix] if prefix in prefixes else [].
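If the numeric part could ever exceed two digits (where plain lexicographic comparison breaks down), the regex-based key mentioned above might look like this (a sketch; numeric_suffix is an illustrative helper name, not from the original answer):
import re

def numeric_suffix(item):
    # extract the digits from e.g. 'ABCJ02.ext' and compare them as an integer
    return int(re.search(r'(\d+)', item).group(1))

# inside filter_list(), the final line then becomes:
# return [max(value, key=numeric_suffix) for value in prefixes.values()]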

Convert everything in a dictionary to lower case, then filter on it?

import pandas as pd
import nltk
import os
directory = os.listdir(r"C:\...")
x = []
num = 0
for i in directory:
    x.append(pd.read_fwf("C:\\..." + i))
    x[num] = x[num].to_string()
So, once I have a dictionary x = [] populated by read_fwf for each file in my directory:
I want to know how to make every single character lowercase. I am having trouble understanding the syntax and how it is applied to a dictionary.
I want to define a filter that I can use to count occurrences of a list of words in this newly defined dictionary, e.g.,
list = ['bus', 'car', 'train', 'aeroplane', 'tram', ...]
Edit: Quick unrelated question:
Is pd.read_fwf the best way to read .txt files? If not, what else could I use?
Any help is very much appreciated. Thanks
Edit 2: Sample data and output that I want:
Sample:
The Horncastle boar's head is an early seventh-century Anglo-Saxon
ornament depicting a boar that probably was once part of the crest of
a helmet. It was discovered in 2002 by a metal detectorist searching
in the town of Horncastle, Lincolnshire. It was reported as found
treasure and acquired for £15,000 by the City and County Museum, where
it is on permanent display.
Required output - changes everything in uppercase to lowercase:
the horncastle boar's head is an early seventh-century anglo-saxon
ornament depicting a boar that probably was once part of the crest of
a helmet. it was discovered in 2002 by a metal detectorist searching
in the town of horncastle, lincolnshire. it was reported as found
treasure and acquired for £15,000 by the city and county museum, where
it is on permanent display.
You shouldn't need to use pandas or dictionaries at all. Just use Python's built-in open() function:
# Open a file in read mode with a context manager
with open(r'C:\path\to\your\file.txt', 'r') as file:
    # Read the file into a string
    text = file.read()

# Use the string's lower() method to make everything lowercase
text = text.lower()
print(text)

# Split the text by whitespace into a list of words
word_list = text.split()

# Get the number of elements in the list (the word count)
word_count = len(word_list)
print(word_count)
If you want, you can do it in the reverse order:
# Open a file in read mode with a context manager
with open(r'C:\path\to\your\file.txt', 'r') as file:
    # Read the file into a string
    text = file.read()

# Split the text by whitespace into a list of words
word_list = text.split()

# Use a list comprehension to create a new list with lower() applied to each word
lowercase_word_list = [word.lower() for word in word_list]
print(lowercase_word_list)
Using a context manager for this is good since it automatically closes the file for you as soon as it goes out of scope (de-tabbed from the with statement block). Otherwise you would have to call open() and then remember to call file.close() yourself.
I think there are some other benefits to using context managers, but someone please correct me if I'm wrong.
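For the filtering part of the question, one straightforward way to count just the transport words is collections.Counter over the lowercased words (a sketch building on the snippets above; the target list comes from the question):
from collections import Counter

targets = ['bus', 'car', 'train', 'aeroplane', 'tram']
counts = Counter(text.lower().split())  # text as read in the snippets above
filtered = {word: counts[word] for word in targets}
print(filtered)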
I think what you are looking for is dictionary comprehension:
# Python 3
new_dict = {key: val.lower() for key, val in old_dict.items()}
# Python 2
new_dict = {key: val.lower() for key, val in old_dict.iteritems()}
items() (iteritems() in Python 2) gives you an iterable of the (key, value) tuples represented in the dictionary (e.g. [('somekey', 'SomeValue'), ('somekey2', 'SomeValue2')]).
The comprehension iterates over each of these pairs, creating a new dictionary in the process. In the key: val.lower() section, you can do whatever manipulation you want to create the new dictionary.

How to append and join the centre element (when occurring more than once) in a palindrome

I want to make a palindrome lexicographically from a user input string
I take the input string and count each alphabet's occurrence(odd or even) and store them accordingly in a dictionary. Then, I find the centre element and also store the left and right parts in a sorted manner.
Now, how do I continue when the centre element has multiple occurrences?
from collections import Counter

even = {}
odd = {}
s = input()
s = list(s)
s.sort()
s = Counter(s)
for i, j in s.items():
    if j % 2 == 0:
        even.update({i: j})
    else:
        odd.update({i: j})
print(even, odd)

od = list(odd)
ev = list(even)
if len(odd) == 1:
    center = od[0]
elif len(odd) > 1:
    print('Not Possible')
elif len(odd) == 0:
    center = ''

right = []
for i, j in even.items():
    right.append(i * int(j / 2))
print(right)

left = right[::-1]
print(left)
pal = right + list(center) + left
palin = ''.join(pal)
print(palin)
For example, when the input is crocorc, the output should be corcroc, but I am stuck at orcrc.
You can check for multiple occurrences of the centre element and add the extra occurrences to even:
if odd[od[0]] > 1:
    even[od[0]] = odd[od[0]] - 1
We do a -1 because we have to use one element as the centre element.
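(With crocorc, for example, c occurs three times: one c becomes the centre and the remaining two go into even.)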
Now the issue will be that even will not be sorted, so you need to sort it.
even = sorted(even.items(), key=lambda kv: kv[0])
import collections
even = collections.OrderedDict(even)
The first line of code above sorts even, returning a list of tuples, and the last line converts it back to a dictionary.
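In Python 3.7+ plain dicts preserve insertion order, so the two steps can be collapsed into a single line without OrderedDict (a minor simplification, not part of the original answer):
even = dict(sorted(even.items()))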
Here is the finished code
from collections import Counter

even = {}
odd = {}
s = input()
s = list(s)
s.sort()
s = Counter(s)
for i, j in s.items():
    if j % 2 == 0:
        even.update({i: j})
    else:
        odd.update({i: j})
print(even, odd)

od = list(odd)
ev = list(even)
if len(odd) == 1:
    center = od[0]
elif len(odd) > 1:
    print('Not Possible')
elif len(odd) == 0:
    center = ''

# move the extra occurrences of the centre element into even
# (the len(odd) check guards against indexing an empty od list)
if len(odd) == 1 and odd[od[0]] > 1:
    even[od[0]] = odd[od[0]] - 1

right = []
even = sorted(even.items(), key=lambda kv: kv[0])
import collections
even = collections.OrderedDict(even)
for i, j in even.items():
    right.append(i * int(j / 2))
print(right)

left = right[::-1]
print(left)
pal = right + list(center) + left
palin = ''.join(pal)
print(palin)
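With the example input crocorc, even becomes {'o': 2, 'r': 2, 'c': 2} after the adjustment and ['c', 'o', 'r'] after sorting, so right is ['c', 'o', 'r'] and the printed palindrome is corcroc, as required.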

How to simplify the function which finds homographs?

I wrote the function which finds homographs in a text.
A homograph is a word that shares the same written form as another
word but has a different meaning.
For this I've used POS-Tagger from NLTK(pos_tag).
POS-tagger processes a sequence of words, and attaches a part of
speech tag to each word.
For example:
[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
('completely', 'RB'), ('different', 'JJ')].
Code (edited):
def find_homographs(text):
    homographs_dict = {}
    if isinstance(text, str):
        text = word_tokenize(text)
    tagged_tokens = pos_tag(text)
    for tag1 in tagged_tokens:
        for tag2 in tagged_tokens:
            try:
                if homographs_dict[tag2] == tag1:
                    continue
            except KeyError:
                if tag1[0] == tag2[0] and tag1[1] != tag2[1]:
                    homographs_dict[tag1] = tag2
    return homographs_dict
It works, but takes too much time because I've used two nested for loops. Please advise me how I can simplify it and make it much faster.
It may seem counterintuitive, but you can easily collect all POS tags for each word in your text, then keep just the words that have multiple tags.
from collections import defaultdict

alltags = defaultdict(set)
for word, tag in tagged_tokens:
    alltags[word].add(tag)

homographs = dict((w, tags) for w, tags in alltags.items() if len(tags) > 1)
Note the two-variable loop; it's a lot handier than writing tag1[0] and tag1[1]. You'll have to look up defaultdict (and set) in the manual.
Your output format cannot handle words with three or more POS tags, so the dictionary homographs has words as keys and sets of POS tags as values.
And two more things I would advise: (1) convert all words to lower case to catch more "homographs"; and (2) nltk.pos_tag() expects to be called on one sentence at a time, so you'll get more accurate tags if you sent_tokenize() your text and then word_tokenize() and pos_tag() each sentence separately.
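Putting both suggestions together, the pipeline might look like this (a sketch using standard NLTK functions; lowercasing is applied to the stored word rather than the raw sentence, so the tagger still sees capitalization cues):
from collections import defaultdict
from nltk import sent_tokenize, word_tokenize, pos_tag

def find_homographs(text):
    alltags = defaultdict(set)
    for sentence in sent_tokenize(text):
        for word, tag in pos_tag(word_tokenize(sentence)):
            alltags[word.lower()].add(tag)
    return {w: tags for w, tags in alltags.items() if len(tags) > 1}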
Here is a suggestion (not tested); the main idea is to build a dictionary while parsing tagged_tokens, to identify homographs without a nested loop:
temp_dict = dict()
for word, tag in tagged_tokens:
    # collect every tag seen for each word (setdefault returns the list, so append works)
    temp_dict.setdefault(word, []).append(tag)

# keep only the words that received more than one distinct tag
temp_dict = {word: tags for word, tags in temp_dict.items() if len(set(tags)) > 1}
print(temp_dict)
