How to use the Google Cloud Translate API for translating bulk data? - python-3.x

I have a CSV file of several thousand rows in multiple languages, and I am thinking of using the Google Cloud Translate API to translate the foreign-language text into English. I used the simple code below to check that everything works properly, and it runs smoothly.
from google.cloud import translate_v2 as translate
from time import sleep
from tqdm.notebook import tqdm
import multiprocessing as mp
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "file path.py"
translate_client = translate.Client()
text = "Good Morning, My Name is X."
target ="ja"
output = translate_client.translate(text, target_language=target)
print(output)
I now want to import the CSV file (using pandas), translate the text, and save the output as a CSV file, but I don't know how to do that. Most of the examples I found stop at translating sample text, just like the one above.
Can anyone suggest how I can do this?

To translate the text in a CSV file and save the output in the same CSV file using the Google Cloud Translation API, you can use the code below:
import csv
from pathlib import Path


def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    import six
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)
    # print(u"Text: {}".format(result["input"]))
    # print(u"Translation: {}".format(result["translatedText"]))
    # print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return result["translatedText"]


def main(input_file, translate_to):
    """
    Translate a text file and save as a CSV file
    using Google Cloud Translation API
    """
    input_file_path = Path(input_file)
    target_lang = translate_to
    output_file_path = input_file_path.with_suffix('.csv')

    with open(input_file_path) as f:
        list_lines = f.readlines()
    total_lines = len(list_lines)

    with open(output_file_path, 'w') as csvfile:
        my_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        my_writer.writerow(['id', 'original_text', 'translated_text'])
        for i, each_line in enumerate(list_lines):
            line_id = f'{i + 1:04}'
            original_text = each_line.strip('\n')  # Strip for the writer(*).
            translated_text = translate_text(
                target=target_lang,
                text=each_line)
            my_writer.writerow([line_id, original_text, translated_text])  # (*)
            # Progress monitor, non-essential.
            print(f"""
{line_id}/{total_lines:04}
{original_text}
{translated_text}""")


if __name__ == '__main__':
    origin_file = input('Input text file? >> ')
    output_lang = input('Output language? >> ')
    main(input_file=origin_file,
         translate_to=output_lang)
Example:
The text in the input file was translated to target language "es", and the output was stored in the same CSV file.
Input:
new.csv
How are you doing,Is everything fine there
Do it today
Output:
new.csv
id,original_text,translated_text
0001,"How are you doing,Is everything fine there",¿Cómo estás? ¿Está todo bien allí?
0002,Do it today,Hazlo hoy
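If you prefer to load the file with pandas, as mentioned in the question, the same helper can be applied column-wise. A minimal sketch, assuming a hypothetical input file input.csv with a column named text and the translate_text function defined above:
import pandas as pd

# Hypothetical input: a CSV with a column named "text" (adjust to your file).
df = pd.read_csv('input.csv')

# Translate row by row; each apply() call hits the API once, so for several
# thousand rows consider batching or pausing to stay within quota limits.
df['translated_text'] = df['text'].apply(
    lambda t: translate_text(target='en', text=t))

df.to_csv('translated_output.csv', index=False)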

Related

Reading an ini (text) file in Python 3 and using the data to print strings on the screen independently of the text file's encoding format

In recent days I asked a couple of questions related to string encoding and file encoding in Python, but the real problem is harder than I supposed, and now I have a clearer understanding of it.
Windows encodes text files in different formats depending on the language or language group. So, I need to be able to read an ini (text) file encoded in different formats, because some keys contain strings that I need to display on forms and menus on the screen.
I would like to write something (that has to work with any text file and encoding format) similar to this code example:
from configparser import ConfigParser
import magic


def detect(iniFileName):
    return magic.Magic(mime_encoding=True).from_file(iniFileName)

# ---------------------------------------------------------------------------
encoding = detect('config.ini')

ini = ConfigParser()
ini.read('config.ini', encoding)

title = ini.get('USERENTRY-FORM', 'TITLE')
'''
then title is passed to the tk form
'''
UserEntryForm.setTitle(title)
if _DEBUG == True:
    print("USERENTRY-FORM title=", title)
This is the new solution, which seems to work better because it recognizes the encoding format more reliably. AIniParser is a wrapper class around ConfigParser.
from chardet import detect


def chardetPrint(filename):
    text = open(filename, 'rb').read()
    print(filename, ": ", detect(text))

# ---------------------------------------------------------------------------
def chardet(filename):
    text = open(filename, 'rb').read()
    print(filename, ": ", detect(text)['encoding'])  # only for test
    return detect(text)['encoding']


if __name__ == '__main__':
    from ainiparser import AIniParser

    def testIniRead(filename, section, key):
        with open(filename, "rb") as f:
            line = f.readline()
            print("line 1: ", line)
        encode = chardet(filename)
        ini = AIniParser(filename, encoding=encode)
        ini._open()
        text = ini.getString(section, key)
        print(text)

    def main():
        testIniRead("../cdata/config-cro-16.ini", "CGUIINTF", "game-turn-text")
        testIniRead("../bat/output-lva-16/idata/config-lva-16.ini", "CGUIINTF", "game-turn-text")
        testIniRead("../idata/config-lva-16.ini", "CGUIINTF", "game-turn-text")
        testIniRead("../idata/domande.ini", "D2", "B")

    # ---------------------------------------------------------------------------
    main()
This solution seems to recognize better how files are encoded, but I am still testing it.
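For readers who do not have the AIniParser wrapper, here is a minimal sketch of the same idea using chardet together with the standard ConfigParser; the file name config.ini and the section/key names are just placeholders:
from configparser import ConfigParser
from chardet import detect


def detect_encoding(filename):
    # Read the raw bytes and let chardet guess the encoding.
    with open(filename, 'rb') as f:
        raw = f.read()
    return detect(raw)['encoding']


encoding = detect_encoding('config.ini')
ini = ConfigParser()
ini.read('config.ini', encoding=encoding)
title = ini.get('USERENTRY-FORM', 'TITLE')
print("USERENTRY-FORM title=", title)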

Remove punctuation from list

I am working on setting up some usable data for semantic analysis. I have a corpus of raw text data that I am iterating over. I open the data, read it as a string, split into a list, and prepare the data to be built into a dataset in a later function. However, when I build the dataset, my most common words end up being punctuation. I need to remove all punctuation from the list before I process the data further.
import os
import collections
import string
import sys
import tensorflow as tf
import numpy as np
from six.moves import xrange

totalvocab = []

# Loop for: loop through all files in 'Data' directory
for subdir, dirs, files in os.walk('Data'):
    for file in files:
        filepath = subdir + os.sep + file
        print(filepath)

        # Function for: open file, convert input to string, split into list
        def read_data(filepath):
            with open(filepath, 'r') as f:
                data = tf.compat.as_str(f.read()).split()
            return data

        # Run function on data, add file data to full data set.
        filevocab = read_data(filepath)
        totalvocab.extend(filevocab)
        filevocab_size = len(filevocab)
        print('File vocabulary size: %s' % filevocab_size)

totalvocab_size = len(totalvocab)
print('Total vocabulary size: %s' % totalvocab_size)
If I do the following:
def read_data(filepath):
    with open(filepath, 'r') as f:
        data = tf.compat.as_str(f.read())
        data.translate(string.punctuation)
        data.split()
    return data
The words are split into individual letters.
Any other methods I have attempted have errored out.
There are a couple of errors in the code:
str.split() and str.translate() do not modify in place.
str.translate() expects a mapping.
To fix:
def read_data(filepath):
    with open(filepath, 'r') as f:
        data = tf.compat.as_str(f.read())
    data = data.translate(str.maketrans('', '', string.punctuation))
    return data.split()
Removing punctuation may or may not do what you want; e.g. hyphenated words will become concatenated. You could alternatively identify punctuation that you would replace with a space, as sketched below.
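A minimal sketch of that alternative (replacing every punctuation character with a space before splitting), independent of TensorFlow:
import string


def split_without_punctuation(text):
    # Map every punctuation character to a space, then split on whitespace,
    # so hyphenated words become two tokens instead of one concatenated token.
    table = str.maketrans({c: ' ' for c in string.punctuation})
    return text.translate(table).split()


print(split_without_punctuation("semi-automatic analysis, e.g. word counts!"))
# ['semi', 'automatic', 'analysis', 'e', 'g', 'word', 'counts']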

Unicode characters to Turkish characters

(Edit: my original question is posted here, but the issue has been resolved and the code below is correct.) I am looking for advice on how to convert Unicode escape sequences to Turkish characters. The following code (posted online) scrapes tweets for an individual user and outputs a CSV file, but the Turkish characters come out as Unicode escapes, e.g. \xc4. I am using Python 3 on a Mac.
import tweepy  # https://github.com/tweepy/tweepy
import csv
import string

# Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""


def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]

    # write the csv
    with open('%s_tweets.csv' % screen_name, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)


if __name__ == '__main__':
    # pass in the username of the account you want to download
    get_all_tweets("")
The csv module docs recommend that you specify the encoding when you open the file (and also that you use newline='' so the csv module can do its own handling of newlines). Don't encode Unicode strings yourself when writing rows.
import csv

with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'created_at', 'text'])
    writer.writerows([[123, 456, 'Äβç']])
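If the resulting CSV will be opened in Excel, writing with encoding='utf-8-sig' (as in the question's code) adds a byte-order mark, which helps Excel detect the encoding and display Turkish characters correctly. A small sketch with hypothetical Turkish sample text:
import csv

# utf-8-sig prepends a BOM so spreadsheet software recognizes the encoding.
with open('turkish.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'text'])
    writer.writerow([1, 'Günaydın: ğüşıöç ĞÜŞİÖÇ'])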

Loop only iterates first instance and how do I extract single filenames via os.walk?

Objective: A script that searches a directory for Axograph files, performs a series of computations, and either returns the values or saves them to a CSV file.
Update: The data comes from raw Axograph files of intracellular recordings, where each file has 4 waves of data. I want to do the very basic ephys calculations of properties:
input resistance,
half-width at half-maximum,
rheobase,
SAG potential,
membrane time constant,
afterhyperpolarisation.
I found the Python module "stimfit", which has useful libraries for this, but it does not import properly. I'm currently troubleshooting this package.
This is my first script and my first recordings, so if anyone has ideas, please share.
Problem 1. "f = axographio.read(filename)" only takes one argument: a string specific to "filename.axgd". I can't use this call on its own, since the resulting "file" object is a list and not the single filename instance that I seek. I think the function above could work as "f = axographio.read(file)" if I can get that file object as a list or array of the filenames and extract them.
Problem 1 is now solved with f = axographio.read(os.path.join(root, file)) - it was unclear that os.path worked within the axographio module.
Problem 2. I can print all my files as a list of strings, but I am unable to extract and open each single file (even with file=open(blablabla)).
Update: I can either loop over each open file or save them to a list or array. I don't mind advice on best practice here. Axograph seems to work well with NumPy arrays, but a list is tempting since it has more flexibility. Any advice on this?
Note: it's not necessary to make this complicated. The key point is to perform computations on multiple files of my choosing. Whether I save this to a file, or have more or less control, is a matter of taste and time.
# -*- coding: utf-8 -*-
"""
#author: Martenzi
"""
import os, sys
import numpy as np
import matplotlib.pylab as plt
import axographio
from scipy import stats
import re
from collections import defaultdict

""" Search a directory for all Axograph files """
for root, dirs, files in os.walk("."):
    for file in files:
        if file.endswith(".axgd"):
            f = axographio.read(os.path.join(root, file))
            plt.plot(f.data[0], f.data[1])
            plt.plot(f.data[0], f.data[2])
            plt.plot(f.data[0], f.data[3])
            plt.plot(f.data[0], f.data[4])
            plt.show()  # show() takes no filename; call it after plotting
""" Below is various code snippets that I have tried out. They are not in specific order. I have tried things and stacked it as comments """
# if(line == 'foo'):
# line = next(irofile) #BEWARE, This could raise StopIteration!
# print line
# for i in file:
# cells = [];
# cells.append(file);
# for index, w in enumerate (loopme):
# cells = array ( [i] ,dtype=complex)
# print(file)
# for k in file:
# d = defaultdict(int)
# d[k].append()
# m=self.fileFormatRegex.match(file)
# self.processFile(root, open(os.path.join(r"/", root, file)), age, inString)
# infile = open(cells,"r")
# fullpath = os.path.walk(files)
# infile = open(file, "r")
# f = open(file ["r"][buffering])
# with open(infile, mode='r'):
## f = axographio.read(file)
# print(file)
# f = axographio.read(cell)
# with open(fullpath, 'r') as f:
# f = axographio.read(file)
# data = re.sub(r'(\s*function\s+.*\s*{\s*)',
# r'\1echo "The function starts here."',
# f.read())
# with open(fullpath, 'w') as f:
# f.write(data)
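Regarding Problem 2 and the list-vs-array question, one common pattern is to first collect all matching paths into a plain list, then loop over that list for the per-file computations. A minimal sketch under that assumption (the analysis step is just a placeholder plot):
import os
import axographio
import matplotlib.pylab as plt

# Collect every .axgd path under the current directory into a list.
axgd_paths = []
for root, dirs, files in os.walk("."):
    for name in files:
        if name.endswith(".axgd"):
            axgd_paths.append(os.path.join(root, name))

print("Found %d Axograph files" % len(axgd_paths))

# Process each file; the per-file computations would go where the plot is.
for path in axgd_paths:
    f = axographio.read(path)
    plt.plot(f.data[0], f.data[1])
    plt.title(path)
    plt.show()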

How to convert an mbox to a JSON structure?

I am trying to convert an mbox to a JSON structure suitable for import into MongoDB. I am using the mailbox chapter from Mining the Social Web, Second Edition, but it's not working properly.
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
OUT_FILE = MBOX + '.json'


def cleanContent(msg):
    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg)
    except:
        return ''
    return ''.join(soup.findAll(text=True))


# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    def default(self, o): return list(o)


# The generator itself...
def gen_json_msgs(mb):
    while 1:
        msg = mb.next()
        if msg is None:
            break
        yield jsonifyMessage(msg)


def jsonifyMessage(msg):
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
                                 .replace(' ', '').decode('utf-8', 'ignore').split(',')

    for part in msg.walk():
        json_part = {}
        if part.get_content_maintype() != 'text':
            print >> sys.stderr, "Skipping MIME content in JSONification ({0})".format(part.get_content_maintype())
            continue
        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
        json_msg['parts'].append(json_part)

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date': millis}

    return json_msg


mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport
f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
    if msg != None:
        f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()

print "All done"
getting error:
80 # for easy import into MongoDB via mongoimport
81
---> 82 f = open(OUT_FILE, 'w')
83 for msg in gen_json_msgs(mbox):
84 if msg != None:
IOError: [Errno 13] Permission denied: 'resources/ch06-mailboxes/data/enron.mbox.json'
The code you mentioned became obsolete in the Third Edition of Mining the Social Web.
I tried making a workable script that not only converts MBOX to JSON, but also extracts the attachments to usable formats.
Link to the repo:
https://github.com/PS1607/mbox-to-json
Read the README file for usage instructions.
It seems that your problem is related to user permissions rather than Python. Line 82 tries to open a file in the "data" folder, but permission was denied. You could try executing your script with the sudo command from a terminal:
sudo python3 <your script name>
This should take care of the error you pointed out.
PS: Python 3 uses print as a function; line 88 should read
print('All done')
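Since the Second Edition code above is Python 2, here is a minimal Python 3 sketch of the same idea using the standard mailbox module; header handling is deliberately simplified, and the mbox path is the one from the question:
import json
import mailbox

MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
OUT_FILE = MBOX + '.json'


def jsonify_message(msg):
    # Keep the headers as plain strings and collect only text/* payloads.
    json_msg = {k: str(v) for k, v in msg.items()}
    json_msg['parts'] = [
        part.get_payload(decode=True).decode('utf-8', 'ignore')
        for part in msg.walk()
        if part.get_content_maintype() == 'text'
    ]
    return json_msg


# One JSON object per line, suitable for import via mongoimport.
with open(OUT_FILE, 'w', encoding='utf-8') as out:
    for msg in mailbox.mbox(MBOX):
        out.write(json.dumps(jsonify_message(msg)) + '\n')

print('All done')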
