Unicode characters to Turkish characters - python-3.x

(Edit: my original question is posted here, but the issue has been resolved and the code below is correct.) I am looking for advice on how to convert Unicode characters to Turkish characters. The following code (posted online) scrapes tweets for an individual user and outputs a csv file, but the Turkish characters come out as Unicode escape sequences, e.g. \xc4. I am using Python 3 on a Mac.
import sys

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # no-op on Python 3, where the default encoding is already UTF-8
    reload(sys)
    sys.setdefaultencoding(default_encoding)

import tweepy  # https://github.com/tweepy/tweepy
import csv
import string

# Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]

    # write the csv
    with open('%s_tweets.csv' % screen_name, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)

    pass

if __name__ == '__main__':
    # pass in the username of the account you want to download
    get_all_tweets("")

The csv module docs recommend that you specify the encoding when you open the file (and also that you use newline='' so the csv module can do its own newline handling). Don't encode Unicode strings yourself when writing rows.
import csv

with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'created_at', 'text'])
    writer.writerows([[123, 456, 'Äβç']])
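If you want to confirm that the Turkish characters survived the round trip, one quick check (a minimal sketch reusing the file written above) is to read the file back with the same encoding and print the rows:

import csv

with open('test.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)  # non-ASCII text such as 'Äβç' should print as itself, not as escapes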

Related

'utf-8' codec can't encode character '\udf81' in position 7: surrogates not allowed - Email Export

So I wrote a program that converts email files into a dataframe and then exports it to Excel through Outlook; however, I am having issues with a particular inbox that I believe has a non-standard emoji in it that is breaking my Excel writer. I tried a few different versions of a fix I saw in the forums:
'\udf81'.encode('utf-16','surrogatepass').decode('utf-16')
However, I still get the issue about not being able to encode '\udf81'.
Here is my code:
import win32com.client as client
import pandas as pd
import time

start = time.perf_counter()
pd.set_option('display.max_columns', None)

outlook = client.Dispatch('Outlook.Application')
namespace = outlook.GetNameSpace('MAPI')
account = namespace.Folders['gianfabian52@gmail.com']
inbox = account.Folders['Inbox']
all_inbox = inbox.Items

# test inbox
print(inbox.Name)
# test account is correct
print(inbox.Parent.Name)
# count inbox items to verify code collects all items
print(inbox.Items.Count)

# collect all emails in inbox
AllEM = [message for message in inbox.Items]

# create DF
dict = {'Subject': [], 'Body': [], 'Sender_Email': [], 'Account': [],
        'Recieved_Time': [], 'Classification': []}
df = pd.DataFrame(dict)

# 'Classification':[]
EM = [message for message in inbox.Items]

# message.SenderEmailType - determine email type and if in directory
# MailItem.Sender.GetExchangeUser().PrimarySmtpAddress - supposed to pull email from directory - issues
# message.SenderEmailAddress - pull SMTP email
def EmailDF():
    for message in EM:
        df.loc[len(df.index)] = [message.Subject, message.Body, message.SenderEmailAddress,
                                 account, message.ReceivedTime, 'NA']

EmailDF()

# remove timezone to avoid errors, and to keep clarity in pandas package
df["Recieved_Time"] = df["Recieved_Time"].dt.tz_convert(None)
# sort by date and time
df = df.sort_values("Recieved_Time")
# print
print(df)

# writes out as csv without index
# csv
# df.to_csv('GianMail.csv', encoding='utf-8', index=False)
# excel
df.to_excel("GianMail.xlsx", engine='xlsxwriter', index=False)

finish = time.perf_counter()
print("Code executed in", round(finish - start, 2), "seconds")

Dealing with IndexError: list index out of range

import requests
from bs4 import BeautifulSoup
from lxml import etree
import csv

with open('1_colonia.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    next(reader)  # skip the header row
    for row in reader:
        url = row[0]
        page = requests.get(url)
        # parse the html with BeautifulSoup
        soup = BeautifulSoup(page.content, 'html.parser')
        # parse the HTML and print the result to the console
        dom = etree.HTML(str(soup))
        property = dom.xpath('//*[@id="header"]/div/div[2]/h1')
        duration = dom.xpath('//*[@id="header"]/div/p')
        price = dom.xpath('//*[@id="price"]/div/div/span/span[3]')
        # save the data to a CSV file, adding the url as a column to the CSV file
        with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=';')
            writer.writerow([url, property[0].text, duration[0].text, price[0].text])
'1_colonia.csv' contains a list of 815 links of properties on sale.
The script works until this message appears:
Traceback (most recent call last):
File "/home/flimflam/Python/colonia/2_colonia.py", line 23, in <module>
writer.writerow([url, property[0].text, duration[0].text, price[0].text])
IndexError: list index out of range
I am not sure where the problem lies. Can anyone help me out, please?
Thanks,
xpath returns lists (for the kind of expression you are using), so in your script property, duration and price are lists.
Depending on what you're searching for, xpath can return 0, 1 or multiple elements.
So you must check whether there are results in a list before accessing them. If the list is empty and you try to access its first element (as in property[0], for instance), you will get an exception.
A simple way of checking that there is data in your lists before writing to the csv file would be:
with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    # check if the lists are not empty
    if len(property) > 0 and len(duration) > 0 and len(price) > 0:
        writer.writerow([url, property[0].text, duration[0].text, price[0].text])
    else:
        writer.writerow([url, 'error'])
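An equivalent guard (a hedged alternative to the snippet above, not part of the original answer) is to attempt the write and fall back when any of the xpath result lists is empty:

with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    try:
        writer.writerow([url, property[0].text, duration[0].text, price[0].text])
    except IndexError:
        # one of the xpath queries matched nothing on this page
        writer.writerow([url, 'error'])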

How to use the Google Cloud Translate API for translating bulk data?

I have a csv file of several thousand rows in multiple languages, and I am thinking of using the Google Cloud Translate API to translate the foreign-language text into English. I used a simple piece of code to find out whether everything works properly, and it runs smoothly.
from google.cloud import translate_v2 as translate
from time import sleep
from tqdm.notebook import tqdm
import multiprocessing as mp
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "file path.py"
translate_client = translate.Client()
text = "Good Morning, My Name is X."
target ="ja"
output = translate_client.translate(text, target_language=target)
print(output)
I now want to import the csv file (using pandas), translate the text, and save the output as a csv file, but I don't know how to do that. Most of the examples I found stop at translating sample text, just like above.
Can anyone suggest how I can do this?
To translate the text in a csv file and save the output as a CSV file using the Google Cloud Translation API, you can use the code below:
import csv
from pathlib import Path

def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    import six
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)

    # print(u"Text: {}".format(result["input"]))
    # print(u"Translation: {}".format(result["translatedText"]))
    # print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return result["translatedText"]

def main(input_file, translate_to):
    """
    Translate a text file and save as a CSV file
    using Google Cloud Translation API
    """
    input_file_path = Path(input_file)
    target_lang = translate_to
    output_file_path = input_file_path.with_suffix('.csv')

    with open(input_file_path) as f:
        list_lines = f.readlines()
    total_lines = len(list_lines)

    with open(output_file_path, 'w') as csvfile:
        my_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        my_writer.writerow(['id', 'original_text', 'translated_text'])

        for i, each_line in enumerate(list_lines):
            line_id = f'{i + 1:04}'
            original_text = each_line.strip('\n')  # Strip for the writer(*).
            translated_text = translate_text(target=target_lang, text=each_line)
            my_writer.writerow([line_id, original_text, translated_text])  # (*)

            # Progress monitor, non-essential.
            print(f"""
{line_id}/{total_lines:04}
{original_text}
{translated_text}""")

if __name__ == '__main__':
    origin_file = input('Input text file? >> ')
    output_lang = input('Output language? >> ')
    main(input_file=origin_file, translate_to=output_lang)
Example:
The text in the input file was translated to target language "es"; because the input file already has a .csv suffix, the output is stored in the same csv file.
Input:
new.csv
How are you doing,Is everything fine there
Do it today
Output:
new.csv
id,original_text,translated_text
0001,"How are you doing,Is everything fine there",¿Cómo estás? ¿Está todo bien allí?
0002,Do it today,Hazlo hoy
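Since the question mentions pandas, a minimal pandas-based variant is sketched below. It assumes the input csv has a column named text holding the foreign-language strings; the file names and column name are only placeholders, not from the original post:

import pandas as pd
from google.cloud import translate_v2 as translate

translate_client = translate.Client()

df = pd.read_csv('input.csv')  # placeholder file name

# translate each row of the (assumed) 'text' column into English;
# the API returns a dict whose 'translatedText' key holds the result
df['translated_text'] = df['text'].apply(
    lambda s: translate_client.translate(str(s), target_language='en')['translatedText']
)

df.to_csv('output_translated.csv', index=False, encoding='utf-8')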

How to parse an eml file and extract meta-data information

I have an eml file with some attachments. I want to read the text content of the eml file and extract meta-data information like (sender, from, cc, bcc, subject). I also want to download the attachments. With the code below I am only able to extract the text content of the body of the email.
import email
from email import policy
from email.parser import BytesParser
import glob

file_list = glob.glob('*.eml')  # returns list of files

with open(file_list[2], 'rb') as fp:  # select a specific email file from the list
    msg = BytesParser(policy=policy.default).parse(fp)
    text = msg.get_body(preferencelist=('plain')).get_content()
print(text)
There was a module named emaildata, available for Python 2, which did the job.
Extracting MetaData Informations
import email
from emaildata.metadata import MetaData
message = email.message_from_file(open('message.eml'))
extractor = MetaData(message)
data = extractor.to_dict()
print data.keys()
Extracting Attachment Information
import email
from emaildata.attachment import Attachment

message = email.message_from_file(open('message.eml'))
for content, filename, mimetype, message in Attachment.extract(message):
    print filename
    with open(filename, 'w') as stream:
        stream.write(content)
    # If message is not None then it is an instance of email.message.Message
    if message:
        print "The file {0} is a message with attachments.".format(filename)
But this library is now deprecated and is of no use. Is there any other library that can extract the meta-data and attachment-related information?
Meta-data information can be accessed using the code below in Python 3.x:
from email import policy
from email.parser import BytesParser

with open(eml_file, 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

print('To:', msg['to'])
print('From:', msg['from'])
print('Subject:', msg['subject'])
The remaining headers can be listed with msg.keys().
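For example, building on the parsed msg from the snippet above, all headers could be collected into a plain dict (a small illustrative sketch):

# gather every header of the parsed message into a dict and print it
headers = {name: msg[name] for name in msg.keys()}
for name, value in headers.items():
    print(name, '->', value)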
For downloading attachments from an eml file, you can use the code below:
import sys
import os
import os.path
from collections import defaultdict
from email.parser import Parser

eml_mail = 'your eml file'
output_dir = 'the directory where you want the files to be downloaded'

def parse_message(filename):
    with open(filename) as f:
        return Parser().parse(f)

def find_attachments(message):
    """
    Return a tuple of parsed content-disposition dict, message object
    for each attachment found.
    """
    found = []
    for part in message.walk():
        if 'content-disposition' not in part:
            continue
        cdisp = part['content-disposition'].split(';')
        cdisp = [x.strip() for x in cdisp]
        if cdisp[0].lower() != 'attachment':
            continue
        parsed = {}
        for kv in cdisp[1:]:
            key, val = kv.split('=')
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            parsed[key] = val
        found.append((parsed, part))
    return found

def run(eml_filename, output_dir):
    msg = parse_message(eml_filename)
    attachments = find_attachments(msg)
    print("Found {0} attachments...".format(len(attachments)))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for cdisp, part in attachments:
        cdisp_filename = os.path.normpath(cdisp['filename'])
        # prevent malicious crap
        if os.path.isabs(cdisp_filename):
            cdisp_filename = os.path.basename(cdisp_filename)
        towrite = os.path.join(output_dir, cdisp_filename)
        print("Writing " + towrite)
        with open(towrite, 'wb') as fp:
            data = part.get_payload(decode=True)
            fp.write(data)

run(eml_mail, output_dir)
Have a look at ParsEML: it bulk-extracts attachments from all eml files in a directory (originally from Stephan Hügel). And I used a modified version of MeIOC to easily extract all metadata in json format; if you want I can share that too.
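If you are already parsing with policy=policy.default as in the metadata snippet above, a shorter route worth knowing is EmailMessage.iter_attachments(). The following is a standard-library sketch, not part of the original answers; the file and directory names are placeholders:

import os
from email import policy
from email.parser import BytesParser

eml_file = 'message.eml'    # placeholder input file
output_dir = 'attachments'  # placeholder output directory

with open(eml_file, 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

os.makedirs(output_dir, exist_ok=True)
for part in msg.iter_attachments():
    filename = part.get_filename()
    if not filename:
        continue
    # keep only the base name so an attachment name cannot escape output_dir
    filename = os.path.basename(filename)
    with open(os.path.join(output_dir, filename), 'wb') as out:
        out.write(part.get_payload(decode=True))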

How to convert an mbox to a JSON structure?

I am trying to convert an mbox to a JSON structure suitable for import into MongoDB.
I am using the mailbox chapter from Mining the Social Web, Second Edition, but it's not working properly.
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
OUT_FILE = MBOX + '.json'

def cleanContent(msg):
    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg)
    except:
        return ''
    return ''.join(soup.findAll(text=True))

# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    def default(self, o): return list(o)

# The generator itself...
def gen_json_msgs(mb):
    while 1:
        msg = mb.next()
        if msg is None:
            break
        yield jsonifyMessage(msg)

def jsonifyMessage(msg):
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
                                 .replace(' ', '').decode('utf-8', 'ignore').split(',')

    for part in msg.walk():
        json_part = {}
        if part.get_content_maintype() != 'text':
            print >> sys.stderr, "Skipping MIME content in JSONification ({0})".format(part.get_content_maintype())
            continue
        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
        json_msg['parts'].append(json_part)

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date': millis}

    return json_msg

mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport
f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
    if msg != None:
        f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()

print "All done"
I am getting this error:
80 # for easy import into MongoDB via mongoimport
81
---> 82 f = open(OUT_FILE, 'w')
83 for msg in gen_json_msgs(mbox):
84 if msg != None:
IOError: [Errno 13] Permission denied: 'resources/ch06-mailboxes/data/enron.mbox.json'
The code you mentioned became obsolete in the Third Edition of Mining the Social Web.
I tried making a workable script that not only converts MBOX to JSON, but also extracts the attachments to usable formats.
Link to the repo -
https://github.com/PS1607/mbox-to-json
Read the README file for usage instructions.
It seems that your problem is related to user permissions instead of Python. Line 82 tries to open a file in the "data" folder, but permission was denied. You should try executing your script using the sudo command from a terminal:
sudo python3 <your script name>
This should take care of the error you pointed out.
PS: Python 3 uses print as a function; line 88 should read
print('All done')
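For reference, here is a minimal Python 3 sketch of the same idea using only the standard-library mailbox and json modules. This is not the book's code, and the mbox path and output name are placeholders:

import json
import mailbox

MBOX = 'enron.mbox'          # placeholder path to your mbox file
OUT_FILE = MBOX + '.json'

def jsonify_message(msg):
    # flatten one mbox message into a JSON-serialisable dict
    json_msg = {k: v for k, v in msg.items()}
    json_msg['parts'] = []
    for part in msg.walk():
        if part.get_content_maintype() != 'text':
            continue
        payload = part.get_payload(decode=True)
        if payload is not None:
            json_msg['parts'].append({
                'contentType': part.get_content_type(),
                'content': payload.decode('utf-8', 'ignore'),
            })
    return json_msg

# one JSON object per line, ready for import via mongoimport
with open(OUT_FILE, 'w', encoding='utf-8') as f:
    for msg in mailbox.mbox(MBOX):
        f.write(json.dumps(jsonify_message(msg)) + '\n')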
