How to separate two concatenaded words - python-3.x

I have a review dataset and I want to process it using NLP techniques. I did all the preprocessing stages (remove stop words, stemming, etc.). My problem is that there are some words, which are connected to each other and my function doesn't understand those. Here is an example:
Great services. I had a nicemeal and I love it a lot.
How can I correct it from nicemeal to nice meal?

Peter Norvig has a nice solution to the word segmentation problem that you are encountering. Long story short, he uses a large dataset of word (and bigram) frequencies and some dynamic programming to split long strings of connected words into their most likely segmentation.
You download the zip file with the source code and the word frequencies and adapt it to your use case. Here is the relevant bit, for completeness.
def memo(f):
"Memoize function f."
table = {}
def fmemo(*args):
if args not in table:
table[args] = f(*args)
return table[args]
fmemo.memo = table
return fmemo
#memo
def segment(text):
"Return a list of words that is the best segmentation of text."
if not text: return []
candidates = ([first]+segment(rem) for first,rem in splits(text))
return max(candidates, key=Pwords)
def splits(text, L=20):
"Return a list of all possible (first, rem) pairs, len(first)<=L."
return [(text[:i+1], text[i+1:])
for i in range(min(len(text), L))]
def Pwords(words):
"The Naive Bayes probability of a sequence of words."
return product(Pw(w) for w in words)
#### Support functions (p. 224)
def product(nums):
"Return the product of a sequence of numbers."
return reduce(operator.mul, nums, 1)
class Pdist(dict):
"A probability distribution estimated from counts in datafile."
def __init__(self, data=[], N=None, missingfn=None):
for key,count in data:
self[key] = self.get(key, 0) + int(count)
self.N = float(N or sum(self.itervalues()))
self.missingfn = missingfn or (lambda k, N: 1./N)
def __call__(self, key):
if key in self: return self[key]/self.N
else: return self.missingfn(key, self.N)
def datafile(name, sep='\t'):
"Read key,value pairs from file."
for line in file(name):
yield line.split(sep)
def avoid_long_words(key, N):
"Estimate the probability of an unknown word."
return 10./(N * 10**len(key))
N = 1024908267229 ## Number of tokens
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)
You can also use the segment2 method as it uses bigrams and is much more accurate.

Related

Token indices sequence length is longer than the specified maximum sequence length for this model (28627 > 512)

I am using BERT's Huggingface DistilBERT model as a backend for a question and answer application. The text I am using with which to train the model is one very large single text field. Even though the text field is a single string, the punctuation was left in place as a clue for BERT. When I execute the application I am getting the "Token indices sequence length error". I am using the transformer.encodeplus() method to pass the text into the model. I have tried various mechanisms to truncate the input ids to a length <= to 512.
I am currently using Windows 10 but I will also be porting the code to a Raspberry Pi 4 platform.
The code is failing at this line:
start_scores, end_scores = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
I am attempting to perform the truncation at this line:
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True).input_ids)
The entire code is here:
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
# globals - set once used everywhere
tokenizer = None
model = None
context = ''
def establishSettings():
global tokenizer, model, context
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=True, model_max_length=512)
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', return_dict=False)
# context = "Some 1,500 volcanoes are still considered potentially active around the world today 161 of those over 10 percent sit within the boundaries of the United States."
# get the volcano corpus
with open('volcanic.corpus', encoding="utf8") as file:
context = file.read().replace('\n', '')
print(len(tokenizer(context, truncation=True).input_ids))
def askQuestion(question):
global tokenizer, model, context
print("\nQuestion ", question)
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True).input_ids)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
start_scores, end_scores = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
ans_tokens = input_ids[torch.argmax(start_scores): torch.argmax(end_scores) + 1]
answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
#all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
return answer_tokens
def main():
# set the global itmes once
establishSettings()
# ask a question
question = "How many potentially active volcanoes are there in the world today?"
answer_tokens = askQuestion(question)
print("answer_tokens: ", answer_tokens)
if len(answer_tokens) == 0:
answer = "Sorry, I don't have an answer for that one. Ask me another question about New Mexico volcanoes."
print(answer)
else:
answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
print("\nFinal Answer : ")
print(answer_tokens_to_string)
if __name__ == '__main__':
main()
What is the best way to truncate the input.ids to <= 512 in length.
Edit this line:
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True).input_ids)
to
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True, max_length=512).input_ids)

How to remove key error from the program?

In the image you can see that i have ID still getting key error I am trying to do a recommendation algorithm so i got this error
#the first argument in the below function to be passed is the id of the book, second argument is the number of books you want to be recommended#
KeyError: <built-in function id>
I am sharing link of article https://towardsdatascience.com/recommender-engine-under-the-hood-7869d5eab072
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
ds = pd.read_csv("test1.csv") #you can plug in your own list of products or movies or books here as csv file#
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
#ngram explanation begins#
#ngram (1,3) can be explained as follows#
#ngram(1,3) encompasses uni gram, bi gram and tri gram
#consider the sentence "The ball fell"
#ngram (1,3) would be the, ball, fell, the ball, ball fell, the ball fell
#ngram explanation ends#
tfidf_matrix = tf.fit_transform(ds['Book Title'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
results = {} # dictionary created to store the result in a dictionary format (ID :
(Score,item_id))#
for idx, row in ds.iterrows(): #iterates through all the rows
# the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending
order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity#
similar_indices = cosine_similarities[idx].argsort()[:-5:-1]
#stores 5 most similar books, you can change it as per your needs
similar_items = [(cosine_similarities[idx][i], ds['ID'][i]) for i in similar_indices]
results[row['ID']] = similar_items[1:]
#below code 'function item(id)' returns a row matching the id along with Book Title. Initially it is a dataframe, then we convert it to a list#
def item(id):
return ds.loc[ds['ID'] == id]['Book Title'].tolist()[0]
def recommend(id, num):
if (num == 0):
print("Unable to recommend any book as you have not chosen the number of book to be
recommended")
elif (num==1):
print("Recommending " + str(num) + " book similar to " + item(id))
else :
print("Recommending " + str(num) + " books similar to " + item(id))
print("----------------------------------------------------------")
recs = results[id][:num]
for rec in recs:
print("You may also like to read: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")
#the first argument in the below function to be passed is the id of the book, second argument is the number of books you want to be recommended#
recommend(5,2)
i have try and run successfully till results variable then getting error.
because python default id keyword is called when you call "def item(id):"
instead of id you have to declare another identifier....then i think this is the only reason for keyerror..
As the error suggests id is an build-in function in python-3. So if you change the name of the parameters id in def item(id) and def recommend(id, num) and all their references then the code should work.
After changing the id and correcting the indentation, an example could look like this:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
ds = pd.read_csv("test1.csv") # you can plug in your own list of products or movies or books here as csv file
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
# ngram explanation begins#
# ngram (1,3) can be explained as follows#
# ngram(1,3) encompasses uni gram, bi gram and tri gram
# consider the sentence "The ball fell"
# ngram (1,3) would be the, ball, fell, the ball, ball fell, the ball fell
# ngram explanation ends#
tfidf_matrix = tf.fit_transform(ds['Book Title'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))
for idx, row in ds.iterrows(): # iterates through all the rows
# the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending
# order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and
# 1 means perfect similarity
similar_indices = cosine_similarities[idx].argsort()[:-5:-1]
# stores 5 most similar books, you can change it as per your needs
similar_items = [(cosine_similarities[idx][i], ds['ID'][i]) for i in similar_indices]
results[row['ID']] = similar_items[1:]
# below code 'function item(id)' returns a row matching the id along with Book Title. Initially it is a dataframe,
# then we convert it to a list#
def item(ID):
return ds.loc[ds['ID'] == ID]['Book Title'].tolist()[0]
def recommend(ID, num):
if num == 0:
print("Unable to recommend any book as you have not chosen the number of book to be recommended")
elif num == 1:
print("Recommending " + str(num) + " book similar to " + item(ID))
else:
print("Recommending " + str(num) + " books similar to " + item(ID))
print("----------------------------------------------------------")
recs = results[ID][:num]
for rec in recs:
print("You may also like to read: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")
# the first argument in the below function to be passed is the id of the book, second argument is the number of books
# you want to be recommended
recommend(5, 2)

Incorrect graph in Word Ladder challenge

I am trying to implement the word ladder problem where I have to convert one word to another in shortest path possible.Obviously we can use the breadth first search (BFS) to solve it but before that we have to first draw the graph.I have implemented the concept of buckets where certain words fall under a bucket if they match the bucket type.But my graph is not implementing correctly.
The given word list is ["CAT", "BAT", "COT", "COG", "COW", "RAT", "BUT", "CUT", "DOG", "WED"]
So for each word I can create a bucket.For example for the word 'CAT', I can have three buckets _AT, C_T, CA_. Similarly I can create buckets for the rest of the words and which ever words match the bucket type will fall under those buckets.
Implementing with hand should give me a graph like this
Since the graph is undirected, so for the vertex COG, its neighbouring vertices should be DOG, COW, COT (relationship work both ways) but instead I am getting COG is connected to nothing.Here is my code below
class Vertex:
def __init__(self,key):
self.id = key
self.connectedTo = {}
def addNeighbour(self,nbr,weight=0):
self.connectedTo[nbr] = weight
#string representation of the object
def __str__(self):
return str(self.id) + " is connected to " + str([x.id for x in self.connectedTo])
def getConnections(self):
return self.connectedTo.keys()
def getId(self):
return self.id
def getWeight(self,nbr):
return self.connectedTo[nbr]
class Graph:
def __init__(self):
self.vertList = {}
self.numVertices = 0
def addVertex(self,key):
self.numVertices += 1
newVertex = Vertex(key)
self.vertList[key] = newVertex
return newVertex
def getVertex(self,n):
if n in self.vertList:
return self.vertList[n]
else:
return None
def addEdge(self,f,t,cost=0):
if f not in self.vertList:
nv = self.addVertex(f)
if t not in self.vertList:
nv = self.addVertex(t)
self.addVertex(f).addNeighbour(self.addVertex(t),cost)
def getVertices(self):
return self.vertList.keys()
def __iter__(self):
return iter(self.vertList.values())
wordList = ["CAT", "BAT", "COT", "COG", "COW", "RAT", "BUT", "CUT", "DOG", "WED"]
def buildGraph(wordList):
d = {} #in this dictionary the buckets will be the keys and the words will be their values
g = Graph()
for i in wordList:
for j in range(len(i)):
bucket = i[:j] + "_" + i[j+1:]
if bucket in d:
#we are storing the words that fall under the same bucket in a list
d[bucket].append(i)
else:
d[bucket] = [i]
# create vertices for the words under the buckets and join them
#print("Dictionary",d)
for bucket in d.keys():
for word1 in d[bucket]:
for word2 in d[bucket]:
#we ensure same words are not treated as two different vertices
if word1 != word2:
g.addEdge(word1,word2)
return g
# get the graph object
gobj = buildGraph(wordList)
for v in gobj: #the graph contains a set of vertices
print(v)
The result I get is
BUT is connected to ['BAT']
CUT is connected to ['COT']
COW is connected to ['COG']
COG is connected to []
CAT is connected to []
DOG is connected to ['COG']
RAT is connected to ['BAT']
COT is connected to []
BAT is connected to []
I was hoping the results to be something like
BUT is connected to ['BAT', 'CUT']
CUT is connected to ['CAT', 'COT', 'BUT']
and so on....
What am I doing wrong?
The problem is in your addEdge method.
You are checking if vertices are already present in graph, ok. But if they are present, you are creating new vertices anyway and adding edge for those new vertices, throwing away the previous ones. That's why you have exactly one edge for each vertex in the end.
Just change the last line of addEdge to :
self.vertList[f].addNeighbour(self.vertList[t],cost)

Markov analysis - Return and recursion role

I am working on the solution of the Markov analysis in Think Python, but I do not understand the role of "Return" in the block code below.
As far as I known when the code reach return the function is cancel immediately, but isn't it unnecessary in this case, because there is a recursion here random_text(n-i) before the code reach the return statement, so the function will cancel only when the recursion is finish which mean when the for loop is over?? The question seem stupid but I am newbie in python and the recursion stuff is really confusing with me. I try to remove 'return' and it still run well.
def random_text(n=100):
start = random.choice(list(suffix_map.keys()))
for i in range(n):
suffixes = suffix_map.get(start, None)
if suffixes == None:
# if the start isn't in map, we got to the end of the
# original text, so we have to start again.
random_text(n-i)
return
word = random.choice(suffixes)
print(word, end=' ')
start = shift(start, word)
The full code is as below so you can understand what each function do.
from __future__ import print_function, division
import os
os.chdir(r"C:\Users\Hoang-Ngoc.Anh\Documents\WinPython-64bit 3.4.4.2\notebooks\docs")
import sys
import string
import random
# global variables
suffix_map = {} # map from prefixes to a list of suffixes
prefix = () # current tuple of words
def process_file(filename, order=2):
"""Reads a file and performs Markov analysis.
filename: string
order: integer number of words in the prefix
returns: map from prefix to list of possible suffixes.
"""
fp = open(filename)
skip_gutenberg_header(fp)
for line in fp:
for word in line.rstrip().split():
process_word(word, order)
def skip_gutenberg_header(fp):
"""Reads from fp until it finds the line that ends the header.
fp: open file object
"""
for line in fp:
if line.startswith('*END*THE SMALL PRINT!'):
break
def process_word(word, order=2):
"""Processes each word.
word: string
order: integer
During the first few iterations, all we do is store up the words;
after that we start adding entries to the dictionary.
"""
global prefix
if len(prefix) < order:
prefix += (word,)
return
try:
suffix_map[prefix].append(word)
except KeyError:
# if there is no entry for this prefix, make one
suffix_map[prefix] = [word]
prefix = shift(prefix, word)
def random_text(n=100):
"""Generates random wordsfrom the analyzed text.
Starts with a random prefix from the dictionary.
n: number of words to generate
"""
# choose a random prefix (not weighted by frequency)
start = random.choice(list(suffix_map.keys()))
for i in range(n):
suffixes = suffix_map.get(start, None)
if suffixes == None:
# if the start isn't in map, we got to the end of the
# original text, so we have to start again.
random_text(n-i)
return
# choose a random suffix
word = random.choice(suffixes)
print(word, end=' ')
start = shift(start, word)
def shift(t, word):
"""Forms a new tuple by removing the head and adding word to the tail.
t: tuple of strings
word: string
Returns: tuple of strings
"""
return t[1:] + (word,)
def main(script, filename='emma.txt', n=100, order=2):
try:
n = int(n)
order = int(order)
except ValueError:
print('Usage: %d filename [# of words] [prefix length]' % script)
else:
process_file(filename, order)
random_text(n)
print()
if __name__ == '__main__':
main(*sys.argv)

Good-Turing Smoothing Results

My implementation of Good-Turing smoothing produced the perplexity numbers below. These don't seem correct, though. Any intuitions as to why? I am using a corpus of 1,000 movie reviews from NLTK. My implementation seems correct (reproduced below).
1gram ppl: 1057.398218919647
2gram ppl: 3262.444941553032
3gram ppl: 68.10224173098685
4gram ppl: 4.542117543343882
5gram ppl: 1.7044134004884632
def good_turing_prob(ngram_occurences,freq_of_freq,total_ngram_count):
# unseen gram
if ngram_occurences == 0:
N_1 = freq_of_freq[1]
N = total_ngram_count
return N_1/N
#ngram is present in model
else:
# take closest count if count+1 is not present
N_c_plus_1 = freq_of_freq[min(freq_of_freq, key= lambda x:abs(x-(ngram_occurences+1)))]
N_c = freq_of_freq[min(freq_of_freq, key= lambda x:abs(x-ngram_occurences))]
good_turing_count = (ngram_occurences+1) * (N_c_plus_1/N_c)
return good_turing_count/total_ngram_count

Resources