Do not produce an empty list of lists in pandas

Background
1) I have the following code to create a df
import pandas as pd

word_list = ['crayons', 'cars', 'camels']
l = ['there are many different crayons in the bright blue box',
     'i like a lot of sports cars because they go really fast',
     'the middle east has many camels to ride and have fun']
df = pd.DataFrame(l, columns=['Text'])
df

                                                       Text
0  there are many different crayons in the bright blue box
1  i like a lot of sports cars because they go really fast
2     the middle east has many camels to ride and have fun
2) And I have the following code to create a function
def find_next_words(row, word_list):
    sentence = row[0]
    # trigger words are the elements in the word_list
    trigger_words = []
    next_words = []
    last_words = []
    for keyword in word_list:
        words = sentence.split()
        for index in range(0, len(words) - 1):
            if words[index] == keyword:
                trigger_words.append(keyword)
                # get the 3 words that follow the trigger word
                next_words.append(words[index + 1:index + 4])
                # get the 3 words that come before the trigger word
                # DOES NOT WORK...PRODUCES EMPTY LIST
                last_words.append(words[index - 1:index - 4])
    return pd.Series([trigger_words, last_words, next_words], index=['TriggerWords', 'LastWords', 'NextWords'])
3) This function treats the words in word_list as "trigger words" and finds the 3 words that come before and after each of them
4) I then use the following code
df = df.join(df.apply(lambda x: find_next_words(x, word_list), axis=1))
5) And it produces the following df, which is close to what I want
                               Text TriggerWords LastWords              NextWords
0  there are many different crayons    [crayons]      [[]]    [[in, the, bright]]
1       i like a lot of sports cars       [cars]      [[]]  [[because, they, go]]
2   the middle east has many camels     [camels]      [[]]      [[to, ride, and]]
Problem
6) However, each entry in the LastWords column is an empty list of lists, [[]]. I think the problem is this line of code, last_words.append(words[index - 1:index - 4]), taken from the find_next_words function above.
7) This is confusing to me because the NextWords column uses very similar code, next_words.append(words[index + 1:index + 4]), taken from the same function, and it works.
Question
8) How do I fix my code so it does not produce the empty list of lists [[]] and instead gives me the 3 words that come before each word in the word_list?

I think it should be words[max(index - 3, 0):index] in the code. The original slice words[index - 1:index - 4] comes back empty because its start lies past its stop, and a slice with the default step only runs forward; the max() guard keeps the start from going negative and wrapping around to the end of the list.
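To make the difference concrete, here is a small standalone sketch (my illustration, using the first sentence from the df above):

words = 'there are many different crayons in the bright blue box'.split()
index = words.index('crayons')  # 4

print(words[index - 1:index - 4])      # [] -- start is past stop, slices only run forward
print(words[max(index - 3, 0):index])  # ['are', 'many', 'different']

# near the start of the sentence the max() guard matters
index = words.index('are')             # 1
print(words[max(index - 3, 0):index])  # ['there'] -- without max(), words[-2:1] would be []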

Related

Counting: How do I add a zero if a word does not occur in a list?

I would like to find keywords from a list, but return a zero if the word does not exist (in this case: part). In this example, collabor occurs 4 times and part 0 times.
My current output is
[['collabor', 4]]
But what I would like to have is
[['collabor', 4], ['part', 0]]
str1 = ["collabor", "part"]
x10 = []
for y in wordlist:
    for string in str1:
        if y.find(string) != -1:
            x10.append(y)
from collections import Counter
x11 = Counter(x10)
your_list = [list(i) for i in x11.items()]
rowssorted = sorted(your_list, key=lambda x: x[0])
print(rowssorted)
Although you have not written your problem and requirements clearly, I think I understood the task.
I assume you have a set of words that may or may not occur in a given list, and you want to print the count of each of those words based on its occurrences in that list.
Code:
constants = ["part", "collabor"]
wordlist = ["collabor", "collabor"]
d = {}
for const in constants:
    d[const] = 0
for word in wordlist:
    if word in d:
        d[word] += 1
    else:
        d[word] = 0
from collections import Counter
x11 = Counter(d)
your_list = [list(i) for i in x11.items()]
rowssorted = sorted(your_list, key=lambda x: x[0])
print(rowssorted)
output:
[['collabor', 2], ['part', 0]]
This approach gives the required output.
In Python, a dictionary is the usual way to count occurrences.
Hope it helps!
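As a side note, the same result can be had a bit more compactly with Counter alone; this is just a sketch under the same inputs as above, not the only way to do it:

from collections import Counter

constants = ["part", "collabor"]
wordlist = ["collabor", "collabor"]

# seed every tracked word with zero, then count only the tracked words
counts = Counter({c: 0 for c in constants})
counts.update(w for w in wordlist if w in counts)

print(sorted([k, v] for k, v in counts.items()))
# [['collabor', 2], ['part', 0]]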

Use lambda, apply, and join function on a pandas dataframe

Goal
Apply deid_notes function to df
Background
I have a df that resembles this sample df
import pandas as pd
df = pd.DataFrame({'Text' : ['there are many different types of crayons',
                             'i like a lot of sports cars',
                             'the middle east has many camels '],
                   'P_ID': [1, 2, 3],
                   'Word' : ['crayons', 'cars', 'camels'],
                   'P_Name' : ['John', 'Mary', 'Jacob'],
                   'N_ID' : ['A1', 'A2', 'A3']
                   })
# rearrange columns
df = df[['Text', 'N_ID', 'P_ID', 'P_Name', 'Word']]
df
                                         Text N_ID  P_ID P_Name     Word
0  there are many different types of crayons   A1     1   John  crayons
1                i like a lot of sports cars   A2     2   Mary     cars
2            the middle east has many camels   A3     3  Jacob   camels
I use the following function to deidentify certain words within the Text column using NeuroNER http://neuroner.com/
def deid_notes(text):
    # use the predict function from NeuroNER to tag words to be deidentified
    ner_list = n1.predict(text)
    # n1.predict won't work in this toy example because the NeuroNER package
    # needs to be installed (and installation is difficult),
    # but the output resembles: [{'start': 1, 'end': 11, 'id': 1, 'tagged word': 'crayon'}]
    # use the start and end positions of tagged words to deidentify and replace with **BLOCK**
    if len(ner_list) > 0:
        parts_to_take = ([(0, ner_list[0]['start'])]
                         + [(first['end'] + 1, second['start'])
                            for first, second in zip(ner_list, ner_list[1:])]
                         + [(ner_list[-1]['end'], len(text) - 1)])
        parts = [text[start:end] for start, end in parts_to_take]
        deid = '**BLOCK**'.join(parts)
    # if n1.predict does not identify any words to be deidentified, place NaN
    else:
        deid = 'NaN'
    return pd.Series(deid, index=['Deid'])
Problem
I apply the deid_notes function to my df using the following code
fx = lambda x: deid_notes(x.Text,axis=1)
df.join(df.apply(fx))
But I get the following error
AttributeError: ("'Series' object has no attribute 'Text'", 'occurred at index Text')
Question
How do I get the deid_notes function to work on my df?
Assuming you are returning a pandas Series from the deid_notes function, which takes text as its only input argument: pass the axis=1 argument to apply instead of to deid_notes. For example:
# Dummy function
def deid_notes(text):
    deid = 'prediction to: ' + text
    return pd.Series(deid, index=['Deid'])

fx = lambda x: deid_notes(x.Text)
df.join(df.apply(fx, axis=1))
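As a side note (my addition, not part of the original answer): because deid_notes only needs the text itself, you can also apply it to the Text column directly and avoid the axis argument altogether:

# each element of the 'Text' Series is passed to deid_notes as a plain string;
# the returned one-field Series expands into a 'Deid' column on join
df.join(df['Text'].apply(deid_notes))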

Calculating distance from x and y coordinates from a text file

I'm learning Python 3 and I have a question.
I have a text file 'text.txt' and its contents are:
1, 2
3, 4
5, 7
#######
10, 11
19, 20
The number on the left side is an x coordinate and the one on the right side is a y coordinate.
I want to compute one distance from the six numbers (three coordinate pairs) above '#######', and another from the four numbers (two pairs) below it.
Since the distance formula is,
Distance = SQRT((y2-y1)^2 + (x2-x1)^2).
My problem is that I want to extract the six numbers and calculate the total distance over the three pairs. For example,
total_distance = SQRT((4-2)^2 + (3-1)^2) + SQRT((7-4)^2 + (5-3)^2).
After that, I want to get (10, 11) and (19, 20). What confuses me is how to skip '#######' while still extracting the numbers as x and y coordinates.
I began to write code like this:
with open("text.txt") as filestream:
    for line in filestream:
        currentline = line.split(",")
I'm trying to figure out how to solve this issue. Can you help me out or give some advice on what I should do?
Try using regex:
import re

with open("text.txt") as f:
    arr = [[int(l.split(',')[0]), int(l.split(',')[1])]
           for l in f.readlines() if re.match(r'\d+, \d+', l)]
And if you need the parts separated:
import re

arr = []
with open("text.txt") as f:
    parts = re.compile('#+').split(f.read())
for part in parts:
    arr.append([[int(l.split(',')[0]), int(l.split(',')[1])]
                for l in part.split('\n') if re.match(r'\d+, \d+', l)])
And all of that holds, of course, only if you are certain that the file exists and has that particular format.
PS: I can't figure out what you are trying to do with those numbers
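For completeness, a sketch of the distance step itself (my addition, building on the arr produced by the second snippet; math.dist requires Python 3.8+):

import math

# arr holds one list of [x, y] pairs per '#'-separated part of the file
for part in arr:
    total = sum(math.dist(p, q) for p, q in zip(part, part[1:]))
    print(total)

# for the sample file:
# part 1: sqrt((3-1)^2 + (4-2)^2) + sqrt((5-3)^2 + (7-4)^2) ~ 6.434
# part 2: sqrt((19-10)^2 + (20-11)^2) ~ 12.728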

Create a dataframe with NLTK synonyms

Good Morning,
I am using NLTK to get synonyms out of a dataframe of words.
print(df)

    col_1  col_2
0    Book      5
1     Pen      5
2  Pencil      6
from nltk.corpus import wordnet as wn

def get_synonyms(df, column_name):
    df_1 = df["col_1"]
    for i in df_1:
        syn = wn.synsets(i)
        for synset in list(wn.all_synsets('n'))[:2]:
            print(i, "-->", synset)
            print("-----------")
            for lemma in synset.lemmas():
                print(lemma.name())
                ci = lemma.name()
    return (syn)
And it does work, but I would like to get the following dataframe, with the first "n" synonyms of each word in "col_1":
print(df_final)
col_1 synonym
Book booklet
Book album
Pen cage
...
I tried initializing an empty list before both the synset and the lemma loop, and appending to it, but in both cases it didn't work; for example:
synonyms = []
for lemma in synset.lemmas():
    print(lemma.name())
    ci = lemma.name()
    synonyms.append(ci)
You can use:
import pandas as pd
from nltk.corpus import wordnet
from itertools import chain

def get_synonyms(df, column_name, N):
    L = []
    for i in df[column_name]:
        syn = wordnet.synsets(i)
        # flatten all lists by chain, remove duplicates by set
        lemmas = list(set(chain.from_iterable([w.lemma_names() for w in syn])))
        for j in lemmas[:N]:
            # append to final list
            L.append([i, j])
    # create DataFrame
    return pd.DataFrame(L, columns=['word', 'syn'])

# add number of filtered synonyms
df1 = get_synonyms(df, 'col_1', 3)
print(df1)
     word           syn
0    Book   record_book
1    Book          book
2    Book          Word
3     Pen  penitentiary
4     Pen       compose
5     Pen           pen
6  Pencil        pencil
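If you also want the synonyms sitting next to the original column, as in the df_final shown in the question, a merge along these lines should work (a sketch using the df and df1 defined above):

df_final = df.merge(df1, left_on='col_1', right_on='word')[['col_1', 'syn']]
df_final.columns = ['col_1', 'synonym']
print(df_final)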

Searching many strings for many dictionary keys, quickly

I have a unique question, and I am primarily hoping to identify ways to speed up this code a little. I have a set of strings stored in a dataframe, each of which has several names in it and I know the number of names before this step, like so:
print df

                    description  num_people people
0         'Harry ran with Sally'          2     []
1  'Joe was swinging with Sally'          2     []
2            'Lola Dances alone'          1     []
I am using a dictionary with the keys that I am looking to find in description, like so:
my_dict = {'Harry': '1283', 'Joe': '1828', 'Sally': '1298', 'Lola': '1982'}
and then using iterrows to search each string for matches like so:
import re

for index, row in df.iterrows():
    row.people = [key for key in my_dict if re.findall(key, row.description)]
and when run it ends up with
print df

                    description  num_people              people
0         'Harry ran with Sally'          2  ['Harry', 'Sally']
1  'Joe was swinging with Sally'          2    ['Joe', 'Sally']
2            'Lola Dances alone'          1            ['Lola']
The problem I see is that this code is still fairly slow, and I have a large number of descriptions and over 1000 keys. Is there a faster way to perform this operation, maybe one that exploits the known number of people per description?
Faster solution:
# strip the ' at the start and end of the text, create lists of words
splited = df.description.str.strip("'").str.split()
# filtering
df['people'] = splited.apply(lambda x: [i for i in x if i in my_dict.keys()])
print (df)

                    description  num_people          people
0         'Harry ran with Sally'          2  [Harry, Sally]
1  'Joe was swinging with Sally'          2    [Joe, Sally]
2            'Lola Dances alone'          1          [Lola]
Timings:
# [30000 rows x 3 columns]
In [198]: %timeit (orig(my_dict, df))
1 loop, best of 3: 3.63 s per loop

In [199]: %timeit (new(my_dict, df1))
10 loops, best of 3: 78.2 ms per loop

Benchmark setup:
df['people'] = [[], [], []]
df = pd.concat([df] * 10000).reset_index(drop=True)
df1 = df.copy()
my_dict = {'Harry': '1283', 'Joe': '1828', 'Sally': '1298', 'Lola': '1982'}

def orig(my_dict, df):
    for index, row in df.iterrows():
        df.at[index, 'people'] = [key for key in my_dict if re.findall(key, row.description)]
    return (df)

def new(my_dict, df):
    df.description = df.description.str.strip("'")
    splited = df.description.str.split()
    df.people = splited.apply(lambda x: [i for i in x if i in my_dict.keys()])
    return (df)

print (orig(my_dict, df))
print (new(my_dict, df1))
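One further tweak worth trying (my addition, not benchmarked above): build a plain set of the names once and test membership against that, instead of materializing my_dict.keys() inside the lambda on every row:

# same filtering as new(), via a prebuilt set and a list comprehension
names = set(my_dict)
df['people'] = [
    [w for w in desc.strip("'").split() if w in names]
    for desc in df['description']
]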
