Getting <generator object <genexpr> at 0x1193417d8> as output - python-3.x

#Reading files with txt extension
import os
import codecs

def get_sentences():
    for root, dirs, files in os.walk("/Users/Documents/test1"):
        for file in files:
            if file.endswith(".txt"):
                x_ = codecs.open(os.path.join(root, file), "r", "utf-8-sig")
                for lines in x_.readlines():
                    yield lines

formoreprocessing = get_sentences()
#Tokenizing sentences of the text files
from nltk.tokenize import sent_tokenize

for i in formoreprocessing:
    raw_docs = sent_tokenize(i)
    tokenized_docs = [sent_tokenize(i) for sent in raw_docs]

'''Removing Stop Words'''
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = set(stopwords.words("English"))

def strip_stopwords(sentence):
    return ' '.join(word for word in sentence.split() if word not in stopset)

stopword_removed_sentences = (strip_stopwords(sentence) for sentence in raw_docs)
print(stopword_removed_sentences)
The above code is not printing what it is supposed to. Instead it prints
<generator object <genexpr> at 0x1193417d8>
as output. What is the mistake here?
I am using Python 3.5.

Try print(list(stopword_removed_sentences)). This turns the generator into a list before printing it.
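A minimal, self-contained illustration of the difference (the sample data here is made up, not from the question):

words = ['the', 'quick', 'brown', 'fox']
gen = (w.upper() for w in words)   # a generator expression is evaluated lazily
print(gen)                         # prints <generator object <genexpr> at 0x...>
print(list(gen))                   # materializes it: ['THE', 'QUICK', 'BROWN', 'FOX']
# Note: a generator can be consumed only once; calling list(gen) again returns []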

This is the final answer; it provides the best result, resolving the problem I mentioned in my previous comment.
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(''.join(formoreprocessing))
#print(raw_docs)
tokenized_docs = [sent_tokenize(''.join(formoreprocessing)) for sent in raw_docs]
#Removing Stop Words
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = set(stopwords.words("English"))
def strip_stopwords(sentence):
    return ' '.join(word for word in sentence.split() if word not in stopset)
stopword_removed_sentences = (strip_stopwords(sentence) for sentence in raw_docs)
print(list(stopword_removed_sentences))
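One caution about the snippet above (an observation, not from the original thread): get_sentences() returns a generator, so the first ''.join(formoreprocessing) exhausts it and any later join over the same name sees an empty sequence. A minimal sketch that materializes the generator once before reusing it:

all_lines = list(get_sentences())   # read the files once into a list
joined = ''.join(all_lines)         # a plain string can be reused, unlike a generator
raw_docs = sent_tokenize(joined)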

Related

with open(os.path.join(directory, filename), 'r','utf-8') as file: TypeError: 'str' object cannot be interpreted as an integer

Why am I getting this error?
import os
import string
from nltk.corpus import stopwords

# Get a list of English stop words
stop_words = stopwords.words('english')

# Set the input and output directories
input_dir = 'C:\\Users\\acer\\OneDrive\\Desktop\\extracted_data'
output_dir = 'C:\\Users\\acer\\OneDrive\\Desktop\\cleaned extracted data'

# Iterate over the files in the input directory
for filename in os.listdir(input_dir):
    # Read the text file
    with open(os.path.join(input_dir, filename), 'r','utf-8') as file:
        text = file.read()
    # Split the text into a list of words
    words = text.split()
    # Remove punctuation from each word
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    # Remove stop words from the list of words
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    # Join the cleaned words into a single string
    cleaned_text = ' '.join(cleaned_words)
    # Write the cleaned text to a new file
    with open(os.path.join(output_dir, filename), 'w',"utf-8") as file:
        file.write(cleaned_text)
This is my code. I was cleaning my extracted data from a website using stopwords when suddenly this error occurred.
I just solved the error by importing codecs. If you want to open a file in read mode and specify the encoding, you can use the codecs module like this:
import codecs

with codecs.open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
    text = file.read()
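For context, the original TypeError occurs because the third positional parameter of the built-in open() is buffering, an integer, so passing 'utf-8' positionally fails; in Python 3 the encoding can simply be passed as a keyword argument to open() itself. A minimal sketch of the corrected calls, reusing the input_dir, output_dir and cleaned_text names from the question:

import os

for filename in os.listdir(input_dir):
    # Pass encoding as a keyword; the third positional argument of open() is buffering.
    with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    # ... punctuation and stop-word cleaning as in the question ...
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(cleaned_text)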

how to use Google Cloud Translate API for translating bulk data?

I have a csv file of several thousand rows in multiple languages and I am thinking of using the Google Cloud Translate API to translate the foreign-language text into English. I used a simple piece of code to check whether everything works properly, and it runs smoothly.
from google.cloud import translate_v2 as translate
from time import sleep
from tqdm.notebook import tqdm
import multiprocessing as mp
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "file path.py"
translate_client = translate.Client()
text = "Good Morning, My Name is X."
target ="ja"
output = translate_client.translate(text, target_language=target)
print(output)
I now want to import the csv file (using pandas), translate the text, and save the output as a csv file. But I don't know how I should do that. Most of the examples I found stop at translating sample text, just like the one above.
Can anyone suggest how I can do this?
To translate the text in a CSV file and save the output to a CSV file using the Google Cloud Translation API, you can use the code below:
import csv
from pathlib import Path


def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    import six
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)

    # print(u"Text: {}".format(result["input"]))
    # print(u"Translation: {}".format(result["translatedText"]))
    # print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return result["translatedText"]


def main(input_file, translate_to):
    """
    Translate a text file and save as a CSV file
    using Google Cloud Translation API
    """
    input_file_path = Path(input_file)
    target_lang = translate_to
    output_file_path = input_file_path.with_suffix('.csv')

    with open(input_file_path) as f:
        list_lines = f.readlines()
    total_lines = len(list_lines)

    with open(output_file_path, 'w') as csvfile:
        my_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        my_writer.writerow(['id', 'original_text', 'translated_text'])

        for i, each_line in enumerate(list_lines):
            line_id = f'{i + 1:04}'
            original_text = each_line.strip('\n')  # Strip for the writer(*).
            translated_text = translate_text(
                target=target_lang,
                text=each_line)
            my_writer.writerow([line_id, original_text, translated_text])  # (*)

            # Progress monitor, non-essential.
            print(f"""
            {line_id}/{total_lines:04}
            {original_text}
            {translated_text}""")


if __name__ == '__main__':
    origin_file = input('Input text file? >> ')
    output_lang = input('Output language? >> ')

    main(input_file=origin_file,
         translate_to=output_lang)
Example:
Translating the text in the input file to target language “es”, the output is stored in the same CSV file.
Input:
new.csv
How are you doing,Is everything fine there
Do it today
Output:
new.csv
id,original_text,translated_text
0001,"How are you doing,Is everything fine there",¿Cómo estás? ¿Está todo bien allí?
0002,Do it today,Hazlo hoy
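Since the question mentions loading the CSV with pandas, here is a minimal alternative sketch along those lines; the column name text_column and the file names are assumptions, and it reuses the translate_text() helper defined in the answer above:

import pandas as pd

# Hypothetical input: a CSV with a column of foreign-language text.
df = pd.read_csv('input.csv')

# Translate each row of the (assumed) text column into English.
df['translated_text'] = df['text_column'].apply(
    lambda t: translate_text(target='en', text=t))

df.to_csv('translated_output.csv', index=False)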

Lemmatize df column

I am trying to lemmatize content in a df but the function I wrote isn't working. Prior to lemmatizing, the data in the column looked like this.
Then I ran the following code:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]
df['content'] = df["content"].apply(lemmatize_text)
print(df.content)
Now the content column looks like this:
I'm not sure what I did wrong, but I am just trying to lemmatize the data in the content column. Any help would be greatly appreciated.
You are lemmatizing each character instead of each word. Your function should look like this instead:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split(' ')])
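A minimal usage sketch with a toy DataFrame (the sample data is made up, not from the question):

import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split(' ')])

# Toy data standing in for the question's df.
df = pd.DataFrame({'content': ['the cats were running', 'two dogs and three foxes']})
df['content'] = df['content'].apply(lemmatize_text)
print(df.content)   # e.g. 'cats' -> 'cat', 'foxes' -> 'fox' (default noun lemmatization)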

How to extract the last names from text file using nltk in python

import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.corpus import wordnet

inputfile = open('file.txt', 'r')
String = inputfile.read()

def last_name(resume_text):
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    a_list = []
    for sentence in tokenized_sentences:
        a_list = (sentence.split())
        s1 = a_list[1:]
        sentence1 = ''.join(s1)
        tokenized_sentences = nltk.sent_tokenize(sentence1)
        for sentence in tokenized_sentences:
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
                if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                    chunk = chunk[0]
                    (name, tag) = chunk
                    if tag == 'NOUN':
                        return name

if __name__ == '__main__':
    lastname = last_name(String)
    print(lastname)
I want to extract the last name from a resume. It returns the first name correctly but the second name is wrong.
How can I solve this issue?
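One direction worth trying (a sketch of the idea only, not a confirmed fix): collect every token inside each PERSON chunk instead of returning on the first one, then treat the final token of the full name as a candidate last name.

import nltk

def person_names(text):
    """Collect all tokens tagged as part of a PERSON entity."""
    names = []
    for sentence in nltk.sent_tokenize(text):
        tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
        for chunk in tree:
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                # A PERSON chunk may span several tokens, e.g. first name + last name.
                names.extend(token for token, tag in chunk)
    return names

names = person_names("John Smith is a software engineer with ten years of experience.")
print(names)                          # e.g. ['John', 'Smith'], depending on the chunker
print(names[-1] if names else None)   # candidate last name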

Remove punctuation from list

I am working on setting up some usable data for semantic analysis. I have a corpus of raw text data that I am iterating over. I open the data, read it as a string, split into a list, and prepare the data to be built into a dataset in a later function. However, when I build the dataset, my most common words end up being punctuation. I need to remove all punctuation from the list before I process the data further.
import os
import collections
import string
import sys
import tensorflow as tf
import numpy as np
from six.moves import xrange

totalvocab = []

#Loop for: loop through all files in 'Data' directory
for subdir, dirs, files in os.walk('Data'):
    for file in files:
        filepath = subdir + os.sep + file
        print(filepath)

        #Function for: open file, convert input to string, split into list
        def read_data(filepath):
            with open(filepath, 'r') as f:
                data = tf.compat.as_str(f.read()).split()
            return data

        #Run function on data, add file data to full data set.
        filevocab = read_data(filepath)
        totalvocab.extend(filevocab)
        filevocab_size = len(filevocab)
        print('File vocabulary size: %s' % filevocab_size)

totalvocab_size = len(totalvocab)
print('Total vocabulary size: %s' % totalvocab_size)
If I do the following:
def read_data(filepath):
    with open(filepath, 'r') as f:
        data = tf.compat.as_str(f.read())
        data.translate(string.punctuation)
        data.split()
    return data
The words are split into individual letters.
Any other methods I have attempted have errored out.
There are a couple of errors in the code:
str.split() and str.translate() do not modify in place.
str.translate() expects a mapping.
To fix:
def read_data(filepath):
    with open(filepath, 'r') as f:
        data = tf.compat.as_str(f.read())
        data = data.translate(str.maketrans('', '', string.punctuation))
    return data.split()
Removing punctuation may or may not do what you want; e.g. hyphenated words will become concatenated. You could alternatively identify punctuation that you would replace with a space, as in the sketch below.
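A minimal sketch of that alternative, mapping every punctuation character to a space before splitting (plain open() is used here instead of tf.compat.as_str, just to keep the sketch self-contained):

import string

def read_data_spaces(filepath):
    with open(filepath, 'r') as f:
        data = f.read()
    # Map each punctuation character to a space instead of deleting it,
    # so hyphenated words like "state-of-the-art" split into separate tokens.
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return data.translate(table).split()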
