Get text to csv format using python - python-3.x

I am able to get the data from PDF to text, but now I need the data in CSV format with the table structure preserved. I tried to recover the table structure, but it didn't work. I'm also able to generate the output as JSON. Is there a way to get the result into a tabular CSV format? Any inputs?
Below is the code I have used.
import boto3
import time

# Document
s3BucketName = "textractanalysisexample"
documentName = "sheet_example.pdf"

def startJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def isJobComplete(jobId):
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    #print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        #print("Job status: {}".format(status))
    return status

def getJobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    while(nextToken):
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        #print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    return pages

def lambda_handler(event, context):
    jobId = startJob(s3BucketName, documentName)
    #print("Started job with id: {}".format(jobId))
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
        # Print detected text
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "LINE":
                    print(item["Text"])

You can use the csv module to write to a csv file like so:
import csv

with open('my_pdf.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split(",") for line in stripped if line)
    with open('my_pdf.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('title', 'intro'))
        writer.writerows(lines)
You can just put in the rows you need, and this splits your data into comma-separated values. You can find more information on csv.writer (and the csv module in general) in the Python docs.
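Note that detect_document_text only returns PAGE/LINE/WORD blocks, so there is no table structure in its output to recover. To get actual rows and columns, Textract's analysis API with the TABLES feature type is needed instead. Below is a minimal sketch building on the bucket/document names above, assuming a single table per document (multiple TABLE blocks would need grouping through their Relationships):
import time
import csv
import boto3

def get_analysis_blocks(client, job_id):
    # Poll until the analysis job leaves IN_PROGRESS (use SNS in production),
    # then page through all result blocks
    while True:
        resp = client.get_document_analysis(JobId=job_id)
        if resp['JobStatus'] != 'IN_PROGRESS':
            break
        time.sleep(5)
    blocks = resp['Blocks']
    while 'NextToken' in resp:
        resp = client.get_document_analysis(JobId=job_id, NextToken=resp['NextToken'])
        blocks += resp['Blocks']
    return blocks

def table_to_rows(blocks):
    # Resolve each CELL's child WORD blocks into text, keyed by row/column index
    block_map = {b['Id']: b for b in blocks}
    cells = {}
    for block in blocks:
        if block['BlockType'] != 'CELL':
            continue
        words = []
        for rel in block.get('Relationships', []):
            if rel['Type'] == 'CHILD':
                words += [block_map[i]['Text'] for i in rel['Ids']
                          if block_map[i]['BlockType'] == 'WORD']
        cells.setdefault(block['RowIndex'], {})[block['ColumnIndex']] = ' '.join(words)
    # Turn {row: {col: text}} into an ordered list of rows for csv.writer
    return [[row.get(c, '') for c in sorted(row)] for _, row in sorted(cells.items())]

client = boto3.client('textract')
job = client.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': s3BucketName, 'Name': documentName}},
    FeatureTypes=['TABLES'])
blocks = get_analysis_blocks(client, job['JobId'])
with open('sheet_example.csv', 'w', newline='') as f:
    csv.writer(f).writerows(table_to_rows(blocks))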

Related

How to get multiple inputs (JSON files for me) in AWS Lambda from the same user's S3 bucket?

I have attached my hardcoded Python program below, which appends two JSON files from S3 storage manually. Can someone please tell me how to get multiple input files (JSON files) from the S3 bucket automatically? I know we can do this locally in Python by globbing *.json in the program's directory, but I don't understand how to do the same in AWS Lambda.
Python Code:
import glob
import json

result = []
for f in glob.glob("*.json"):
    with open(f, "r") as infile:
        result += json.load(infile)

with open("merge.json", "w") as outfile:
    json.dump(result, outfile)
In Lambda I am able to do it for two files; can someone please suggest how to do the same (taking all JSON files from S3 automatically) in Lambda? Thanks in advance.
import boto3
import json

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def lambda_handler(event, context):
    object_key = "sample1.json"  # replace object key
    file_content = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key)["Body"].read()
    print(file_content)
    object_key2 = "sample2.json"  # replace object key
    file_content2 = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key2)["Body"].read()
    print(file_content2)
    result = []
    result += json.loads(file_content)
    result += json.loads(file_content2)
    print(result)
I have followed the syntax from the documentation but I still get a timeout error.
import boto3

# Create a client
client = boto3.client('s3', region_name='us-east-1')
# Create a reusable Paginator
paginator = client.get_paginator('list_objects')
# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket='bucket-for-json-files')
for page in page_iterator:
    print(page['Contents'])
Getting a timeout error:
import boto3

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def iterate_bucket_items(bucket):
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket)
    for page in page_iterator:
        if page['KeyCount'] > 0:
            for item in page['Contents']:
                yield item

for i in iterate_bucket_items(S3_BUCKET):
    print(i)
I have solved the issue with the help of @JeremyThompson and will attach my final code here:
import json
import boto3

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket-for-json-files')
    # Create a client
    client = boto3.client('s3', region_name='us-east-1')
    # Create a reusable Paginator
    paginator = client.get_paginator('list_objects')
    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(Bucket='bucket-for-json-files')
    result = []
    for page in page_iterator:
        result += page['Contents']
    merge = []
    lst = []
    for i in result:
        cmd = i['Key']
        print(cmd)
The above code prints the key of each JSON file available in the user's bucket.
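From there, merging works the same way as the local glob version: fetch each key's body and extend one list. A minimal sketch (assuming, as in the hardcoded version, that every file holds a JSON array; merge.json is a hypothetical output key):
import json
import boto3

s3_client = boto3.client('s3')
S3_BUCKET = 'bucket-for-json-files'

def lambda_handler(event, context):
    paginator = s3_client.get_paginator('list_objects_v2')
    result = []
    for page in paginator.paginate(Bucket=S3_BUCKET):
        for obj in page.get('Contents', []):
            if obj['Key'] == 'merge.json':
                continue  # skip the output file on re-runs
            body = s3_client.get_object(Bucket=S3_BUCKET, Key=obj['Key'])['Body'].read()
            result += json.loads(body)  # assumes each file holds a JSON list
    # Lambda's filesystem is read-only outside /tmp, so write the merge back to S3
    s3_client.put_object(Bucket=S3_BUCKET, Key='merge.json', Body=json.dumps(result))
    return {'records_merged': len(result)}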

Django - Batch Actions in a ListView - Select Rows and Return a Zip Archive in Response

Ok, first off, I know this is not the best way to handle serving files on a prod server. But this site will be accessed by a small group of users, the requests for these files should be minimal, and the static files being served are very small, so I am trying to keep things simple.
I have a ListView table where the user can select rows and then perform batch actions on the objects (e.g. delete, export CSVs, send CSV data to another location, etc.). I've created a class-based view to handle each of these batch actions. The only action that I can't get to work is when the user requests a download of a ZIP archive of the CSV data.
When the ZIP export action is selected from the ListView, the selected rows are written as individual CSV files into a static file location and then zipped into a single ZIP archive. But the FileResponse never returns the archive to the user as a download.
I've tried different ways of handling the response: redirecting to a new URL, reversing to the ListView, different URL patterns, etc.
The method shown below that handles creating and zipping the CSVs is called batch_csv_zip.
Can someone please point out where I'm going wrong? Thanks.
class BatchActionView(LoginRequiredMixin, FormMixin, SuccessMessageMixin, TemplateView):
    """
    Takes a list of pks to perform a batch action
    """
    model = SlateDoc
    template_name = 'slatedoc_batch_action.html'
    permission_required = ('slatedoc.can_run_batch_actions')
    permission_denied_message = "Permission Denied"

    def dispatch(self, request, *args, **kwargs):
        if not request.user.has_perm(permission_required):
            messages.error(self.request, self.permission_denied_message)
            return HttpResponseRedirect(request.META.get('HTTP_REFERER'))
        else:
            handler = getattr(self, request.method.lower(), self.http_method_not_allowed)
            return handler(request, *args, **kwargs)

    def post(self, request, *args, **kwargs):
        if request.method == 'POST':
            input = request.POST.get('inputString')
            input_dict = eval(input)
            modalID = input_dict["modalID"]
            pkList = input_dict["pks"].split(",")
            self.batch_actions(request, modalID, pkList)
            return HttpResponseRedirect(self.get_success_url())

    def get_success_url(self):
        input = self.request.POST.get('inputString')
        input_dict = eval(input)
        modalID = input_dict["modalID"]
        pks = input_dict["pks"]
        if modalID == "csvModalSubmit":
            return reverse_lazy('slatedoc-list')
            # return reverse('batch-actions')
            # return reverse('slatedoc-download', kwargs={'pks': pks})
            # return reverse('slatedoc-download', kwargs={'file': f"{zipfilename}.zip"})
        else:
            return reverse_lazy('slatedoc-list')

    def batch_actions(self, request, modalID, pkList):
        """
        Check the modalID data from POST to trigger a Batch Action
        """
        if modalID == "deleteModalSubmit":
            self.batch_delete(pkList)
            self.success_message = 'SlateDocs successfully deleted!'
        if modalID == "csvModalSubmit":
            self.batch_csv_zip(pkList)
            self.success_message = 'SlateDocs successfully exported as a ZIP!'
        if modalID == "vantageModalSubmit":
            self.batch_csv_to_vantage(request, pkList)
            self.success_message = 'SlateDocs successfully sent to Vantage!'
        messages.success(self.request, self.success_message)
        # return HttpResponseRedirect(self.get_success_url())

    def batch_delete(self, pkList):
        """
        Perform a soft delete for each pk in the pkList
        """
        for pk in pkList:
            slatedoc = SlateDoc.objects.get(id=pk)
            self.object = slatedoc
            slatedoc.soft_delete()
        return

    def batch_csv_zip(self, pkList):
        """
        Generate CSV for each PK and then return a ZIP archive of the data
        """
        t = time.time()
        datetime = time.strftime('%Y%m%d%H%M', time.localtime(t))
        csv_filepath = f"ngceng/static/docs/csv"
        zippath = f"ngceng/static/docs/zip"
        zipfilename = f"{datetime}_slatedoc_csv"
        os.mkdir(os.path.join(csv_filepath, datetime))
        for pk in pkList:
            item = SlateDoc.objects.get(pk=pk)
            slatedoc_resource = SlateDocResource()
            queryset = SlateDoc.objects.filter(pk=pk)
            dataset = slatedoc_resource.export(queryset)
            filename = item.filename
            filename_list = re.split('(\-+)|(\_+)', filename)
            filepath = f"ngceng/static/docs/csv/{datetime}/{filename_list[0]}.csv"
            with open(filepath, 'w', newline='') as f:
                writer = csv.writer(f, delimiter=',')
                writer.writerow(fieldnames_export)
                writer.writerows(dataset)
        # csvfilelist = [f for f in os.listdir(os.path.dirname(filepath)) if f.endswith(".csv")]
        csv_f = os.path.abspath(os.path.join(csv_filepath, datetime))
        zip_f = os.path.abspath(os.path.join(zippath, f"{zipfilename}.zip"))
        os.chdir(zippath)
        shutil.make_archive(zipfilename, 'zip', csv_f)
        # os.remove(csv_f)
        response = FileResponse(open(zip_f, 'rb').read())
        response['content_type'] = 'application/zip'
        response['Content-Disposition'] = f"attachment; filename = {zipfilename}.zip"
        response['input'] = input
        response['zipfilename'] = zipfilename
        return response
        # return HttpResponseRedirect(reverse('slatedoc-download', kwargs={'pks': ",".join(pkList)}))
        # return HttpResponseRedirect(reverse('slatedoc-download', kwargs={'file': f"{zipfilename}.zip"}))

    def batch_csv_to_vantage(self, request, pkList):
        """
        Generate a CSV file for each pk in the pk List, send these CSVs to Vantage.
        """
        for pk in pkList:
            export_csv_to_vantage(request, pk)
        return HttpResponseRedirect(self.get_success_url())
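A likely culprit (an assumption from reading the flow, not something the question confirms): post() always redirects, so the FileResponse built in batch_csv_zip is returned to batch_actions, discarded there, and never reaches the browser. FileResponse also expects a file-like object rather than the bytes from .read(). A minimal sketch of the two changes, with the rest of the view unchanged:
from django.http import FileResponse, HttpResponseRedirect

# In BatchActionView.post(): return the file response directly for the ZIP action
def post(self, request, *args, **kwargs):
    input_dict = eval(request.POST.get('inputString'))  # as in the original; json.loads would be safer
    modalID = input_dict["modalID"]
    pkList = input_dict["pks"].split(",")
    if modalID == "csvModalSubmit":
        return self.batch_csv_zip(pkList)  # hand the FileResponse straight back
    self.batch_actions(request, modalID, pkList)
    return HttpResponseRedirect(self.get_success_url())

# At the end of batch_csv_zip(): pass the open file object, not its bytes
def batch_csv_zip(self, pkList):
    # ... CSV generation and shutil.make_archive exactly as in the original ...
    return FileResponse(
        open(zip_f, 'rb'),
        as_attachment=True,                  # sets Content-Disposition: attachment
        filename=f"{zipfilename}.zip",
        content_type='application/zip',
    )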

How to parse an eml file and extract metadata information

I have an eml file with some attachments. I want to read the text content of the eml file and extract metadata information such as sender, from, cc, bcc, and subject. I also want to download the attachments. With the help of the code below I am only able to extract the information/text content in the body of the email.
import email
from email import policy
from email.parser import BytesParser
import glob
file_list = glob.glob('*.eml') # returns list of files
with open(file_list[2], 'rb') as fp:  # select a specific email file from the list
    msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain')).get_content()
print(text)
There was a module named emaildata, available for Python 2, that did the job.
Extracting MetaData Information
import email
from emaildata.metadata import MetaData

message = email.message_from_file(open('message.eml'))
extractor = MetaData(message)
data = extractor.to_dict()
print data.keys()
Extracting Attachment Information
import email
from emaildata.attachment import Attachment

message = email.message_from_file(open('message.eml'))
for content, filename, mimetype, message in Attachment.extract(message):
    print filename
    with open(filename, 'w') as stream:
        stream.write(content)
    # If message is not None then it is an instance of email.message.Message
    if message:
        print "The file {0} is a message with attachments.".format(filename)
But this library is now deprecated and is of no use. Is there any other library that could extract the metadata and attachment-related information?
The metadata can be accessed using the code below in Python 3.x:
from email import policy
from email.parser import BytesParser

with open(eml_file, 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)
print('To:', msg['to'])
print('From:', msg['from'])
print('Subject:', msg['subject'])
The remaining header information can be accessed using msg.keys().
For downloading attachments from an eml file you can use the below code:
import sys
import os
import os.path
from collections import defaultdict
from email.parser import Parser

eml_mail = 'your eml file'
output_dir = 'mention the directory where you want the files to be download'

def parse_message(filename):
    with open(filename) as f:
        return Parser().parse(f)

def find_attachments(message):
    """
    Return a tuple of parsed content-disposition dict, message object
    for each attachment found.
    """
    found = []
    for part in message.walk():
        if 'content-disposition' not in part:
            continue
        cdisp = part['content-disposition'].split(';')
        cdisp = [x.strip() for x in cdisp]
        if cdisp[0].lower() != 'attachment':
            continue
        parsed = {}
        for kv in cdisp[1:]:
            key, val = kv.split('=')
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            parsed[key] = val
        found.append((parsed, part))
    return found

def run(eml_filename, output_dir):
    msg = parse_message(eml_filename)
    attachments = find_attachments(msg)
    print("Found {0} attachments...".format(len(attachments)))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for cdisp, part in attachments:
        cdisp_filename = os.path.normpath(cdisp['filename'])
        # prevent malicious crap
        if os.path.isabs(cdisp_filename):
            cdisp_filename = os.path.basename(cdisp_filename)
        towrite = os.path.join(output_dir, cdisp_filename)
        print("Writing " + towrite)
        with open(towrite, 'wb') as fp:
            data = part.get_payload(decode=True)
            fp.write(data)

run(eml_mail, output_dir)
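On Python 3 the standard library alone covers both needs once the message is parsed with policy.default, which yields EmailMessage objects with iter_attachments(). A minimal sketch, assuming 'message.eml' as the input file:
import os
from email import policy
from email.parser import BytesParser

with open('message.eml', 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

# Metadata: every header is available by name
meta = {name: msg[name] for name in ('from', 'to', 'cc', 'bcc', 'subject', 'date')}
print(meta)

# Attachments: iter_attachments() yields each attachment part
for part in msg.iter_attachments():
    filename = part.get_filename()
    if not filename:
        continue
    # strip any path components a malicious sender might include
    filename = os.path.basename(filename)
    with open(filename, 'wb') as out:
        out.write(part.get_payload(decode=True))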
Have a look at ParsEML; it bulk extracts attachments from all eml files in a directory (originally from Stephan Hügel). And I used a modified version of MeIOC to easily extract all metadata in JSON format; if you want I can share that too.

Unable to return email message body

I have created a class to read emails and convert them to a dataframe. This works for all HEADER data, but I am unable to parse the message content and have tried numerous methods. I am following this tutorial: http://beneathdata.com/how-to/email-behavior-analysis/
I have tried amending the fetch_and_parse function in the code to select the message content, but nothing seems to be returned. I have also tried amending the FETCH query, but I'm lost.
from imaplib import IMAP4_SSL
import email as em
from email.utils import parsedate, parsedate_tz
from email.parser import HeaderParser

class OutlookAccount(object):
    def __init__(self, username=None, password=None, folder=None):
        self.username = username
        self.password = password
        self.folder = folder

    def login(self):
        self.conn = IMAP4_SSL('outlook.office365.com')
        response = self.conn.login(self.username, self.password)
        return response

    def search(self, query, folder=None, readonly=False):
        ff = self.folder if self.folder else folder
        self.conn.select(ff, readonly)
        resp, data = self.conn.search(None, query)
        return data

    def fetch(self, uids, query):
        uid_arr = b','.join(uids[0].split())
        resp, data = self.conn.fetch(uid_arr, query)
        return data

    def fetch_and_parse(self, uids, query):
        data = self.fetch(uids, query)
        parser = HeaderParser()
        emails = []
        for email in data:
            if len(email) < 2:
                continue
            msg = em.message_from_bytes(email[1]).as_string()
            emails.append(parser.parsestr(msg))
        return emails

    def load_parse_query(self, search_query, fetch_query, folder=None, readonly=False):
        '''Perform search and fetch on an imap Gmail account. After fetching relevant info
        from fetch query, parse into a dict-like email object, return list of emails.'''
        uids = self.search(search_query, folder, readonly)
        return self.fetch_and_parse(uids, fetch_query)

import numpy as np
import pandas as pd
import getpass
#import matplotlib.pyplot as plt
#import matplotlib.dates as dates
#import matplotlib.gridspec as gridspec
from datetime import timedelta, datetime, date

imap_password = getpass.getpass()
outlook = OutlookAccount(username='some@email.com', password=imap_password)
outlook.login()

daysback = 6000  # ~10yrs...make this whatever ya like
notsince = 0  # since now.
since = (date.today() - timedelta(daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(notsince)).strftime("%d-%b-%Y")

SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
ALL_HEADERS = '(BODY.PEEK[HEADER])'

# Search and fetch emails!
received = outlook.load_parse_query(search_query=SEARCH,
                                    fetch_query=ALL_HEADERS,
                                    folder='"INBOX"')

# create function to convert to dataframe
def scrub_email(headers):
    # IMAP sometimes returns fields with varying capitalization. Lowercase each header name.
    return dict([(title.lower(), value) for title, value in headers])

df = pd.DataFrame([scrub_email(email._headers) for email in received])
I want the dataframe to include all header data and a field that includes the email message content/body.
The body needed to be selected as part of fetch_and_parse using something like:
body = ''
if mime_msg.is_multipart():
    for part in mime_msg.walk():
        if part.is_multipart():
            for subpart in part.get_payload():
                if subpart.is_multipart():
                    for subsubpart in subpart.get_payload():
                        body = body + str(subsubpart.get_payload(decode=True)) + '\n'
                else:
                    body = body + str(subpart.get_payload(decode=True)) + '\n'
        else:
            body = body + str(part.get_payload(decode=True)) + '\n'
else:
    body = body + str(mime_msg.get_payload(decode=True)) + '\n'
body = bytes(body, 'utf-8').decode('unicode-escape')
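For reference, here is a sketch of how fetch_and_parse could capture the body alongside the headers, assuming the fetch query is changed from '(BODY.PEEK[HEADER])' to '(RFC822)' so the full message is downloaded (names follow the class above):
def fetch_and_parse(self, uids, query):
    # expects query='(RFC822)' so the whole message, not just the headers, is fetched
    data = self.fetch(uids, query)
    emails = []
    for item in data:
        if len(item) < 2:
            continue
        mime_msg = em.message_from_bytes(item[1])
        body = ''
        for part in mime_msg.walk():
            # keep only the plain-text leaves of the MIME tree
            if part.get_content_type() == 'text/plain':
                payload = part.get_payload(decode=True)
                if payload:
                    body += payload.decode(part.get_content_charset() or 'utf-8', 'replace')
        headers = dict(mime_msg.items())
        headers['body'] = body
        emails.append(headers)
    return emails
Since this variant returns plain dicts rather than parsed header objects, the dataframe step becomes pd.DataFrame(received) (lowercasing the keys first if you still want scrub_email's behavior).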

Invalid syntax within Lambda using Python 3.6 to pull from DynamoDB

I can't, for the life of me, figure out what is wrong with the following four lines of code.
def getAssetExistance(asset, element, table):
    dynamoTable = dynamo.Table(table)
    response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
    return bool(response)
I am running this through AWS Lambda, and the log on CloudWatch is telling me the error is on the return line. This is the error (line 24 is the return line):
Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 24)
In case this helps at all, here is the rest of the code:
################################
# Slack Lambda handler.
################################
import boto3
import os
import logging
import urllib

# Grab data from the environment.
BOT_TOKEN = os.environ["BOT_TOKEN"]
ASSET_TABLE = os.environ["ASSET_TABLE"]
REGION_NAME = os.getenv('REGION_NAME', 'us-east-1')

dynamo = boto3.client('dynamodb', region_name=REGION_NAME)

# Define the URL of the targeted Slack API resource.
SLACK_URL = "https://slack.com/api/chat.postMessage"

def getAssetExistance(asset, element, table):
    dynamoTable = dynamo.Table(table)
    response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
    return bool(response)

def lambda_handler(data, context):
    # Slack challenge answer.
    if "challenge" in data:
        return data["challenge"]

    # Grab the Slack channel data.
    slack_event = data['event']
    slack_user = slack_event["user"]
    slack_text = slack_event["text"]
    channel_id = slack_event["channel"]
    slack_userID = slack_user["ID"]
    slack_reply = ""

    # Ignore bot messages.
    if "bot_id" in slack_event:
        logging.warn("Ignore bot event")
    else:
        # Start data sift.
        if slack_text.startswith("!networth"):
            slack_reply = "Your networth is: "
        elif slack_text.startwith("!price"):
            command, asset = text.split()
            slack_reply = "The price of a(n) %s is: " % (asset)
        elif slack_text.startwith("!Addme"):
            if not getAssetExistance(slack_userID, userID, ASSET_TABLE):
                slack_reply = "Adding user: %s(%s)" % (slack_user, slack_userID)
                dynamo.update_item(TableName=ASSET_TABLE,
                    Key={'userID': {'S': 'slack_userID'}},
                    AttributeUpdates={
                        'resources': {
                            'Action': 'ADD',
                            'Value': {'N': '1000'}
                        }
                    }
                )
            else
                slack_reply = "User %s(%s) already exists" % (slack_user, slack_userID)

    # We need to send back three pieces of information:
    data = urllib.parse.urlencode(
        (
            ("token", BOT_TOKEN),
            ("channel", channel_id),
            ("text", slack_reply)
        )
    )
    data = data.encode("ascii")

    # Construct the HTTP request that will be sent to the Slack API.
    request = urllib.request.Request(
        SLACK_URL,
        data=data,
        method="POST"
    )

    # Add a header mentioning that the text is URL-encoded.
    request.add_header(
        "Content-Type",
        "application/x-www-form-urlencoded"
    )

    # Fire off the request!
    urllib.request.urlopen(request).read()

    # Everything went fine.
    return "200 OK"
Hopefully I am doing something dumb; I am pretty new to all this. Any help is much appreciated. Thanks!
You're missing a closing round bracket in this line:
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
Replace this line with:
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset))
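Beyond the missing parenthesis, two more things will likely bite (assumptions based on the code shown, since only line 24's traceback is given): Key must be imported from boto3.dynamodb.conditions, and .Table() exists only on the DynamoDB resource interface, not on boto3.client('dynamodb'). A minimal sketch of the helper with both fixed:
import boto3
from boto3.dynamodb.conditions import Key

# .Table() requires the resource interface; a low-level client has no .Table()
dynamo = boto3.resource('dynamodb', region_name='us-east-1')

def getAssetExistance(asset, element, table):
    dynamoTable = dynamo.Table(table)
    response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset))
    # Count is 0 when no matching item exists
    return response['Count'] > 0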
