How to import xlsx into dynamodb using python and boto3 - python-3.x

I'm trying to follow a LinuxAcademy post on how to import Excel data into DynamoDB, but the code in the post is two years old and does not work. Any tips or suggestions would be very helpful.
Sorry, I'm new to Stack Overflow.
I was trying to take an Excel spreadsheet, convert it to JSON, and then upload it to DynamoDB like the LinuxAcademy posting shows. The instructions are old and they use three scripts to upload one file.

Here is the code I used to create an AWS Lambda Python function.
The only problem is that it reads in the Excel file, converts it to JSON, and the file is too big to ingest into DynamoDB before the 5-minute Lambda timeout. I will probably convert it to Step Functions, but this worked for me.
import boto3
import os
import sys
import uuid
import pandas as pd

s3_client = boto3.client('s3')
bucket = "serverless-record-storage-lambda"

def upload_to_dynamodb(report):
    df = pd.read_excel(report)
    df.columns = ["APPLICATION", "FORM_NUMBER", "FILE_DATE", "STATUS_DATE", "STATUS", "STATUS_CODE",
                  "EXPIRATION_DATE", "ESTIMATED COST", "REVISED_COST", "EXISTING_USE", "EXISTING_UNITS",
                  "PROPOSED_USE", "PROPOSED_UNITS", "PLANSETS", "15_DAY_HOLD?", "EXISTING_STORIES",
                  "PROPOSED_STORIES", "ASSESSOR_STORIES", "VOLUNTARY", "PAGES", "BLOCK", "LOT",
                  "STREET_NUMBER", "STREET_NUMBER_SFX", "AVS_STREET_NAME", "AVS_STREET_SFX", "UNIT",
                  "UNIT_SFX", "FIRST_NAME", "LAST_NAME", "CONTRACTORPHONE", "COMPANY_NAME",
                  "STREET_NUMBER", "STREET", "STREET_SUFFIX", "CITY", "STATE", "ZIP_CODE",
                  "CONTACT_NAME", "CONTACT_PHONE", "DESCRIPTION"]
    # Clean up the data; change column types to strings to be on the safer side :)
    df = df.replace({'-': '0'}, regex=True)
    df = df.fillna(0)
    for i in df.columns:
        df[i] = df[i].astype(str)
    # Convert dataframe to a list of dictionaries (JSON) that can be consumed by any NoSQL database
    myl = df.T.to_dict().values()
    # Connect to DynamoDB using boto
    resource = boto3.resource('dynamodb', region_name='us-west-2')
    # Connect to the DynamoDB table
    table = resource.Table('permitdata')
    # Load the JSON objects using the put_item method
    for permit in myl:
        table.put_item(Item=permit)

def handler(event, context):
    for record in event['Records']:
        print(record)
        bucket = record['s3']['bucket']['name']
        print(bucket)
        key = record['s3']['object']['key']
        print(key)
        download_path = '/tmp/{}{}'.format(uuid.uuid4(), key)
        upload_path = '/tmp/resized-{}'.format(key)
        s3_client.download_file(bucket, key, download_path)
        upload_to_dynamodb(download_path)

def main():
    handler(event, None)  # note: 'event' is not defined here; supply a sample S3 event for local testing

if __name__ == "__main__":
    main()
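One way to speed up the ingest before hitting the timeout (a sketch, not part of the original post) is to replace the per-row put_item loop inside upload_to_dynamodb with DynamoDB's batch writer, which buffers items and sends them as batched requests:

    # Replacement for the put_item loop at the end of upload_to_dynamodb:
    # batch_writer() buffers items and flushes them as BatchWriteItem
    # requests (up to 25 items each), reducing round trips to DynamoDB.
    with table.batch_writer() as batch:
        for permit in myl:
            batch.put_item(Item=permit)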

Related

Reading a pdf in AWS lambda using PyMuPDF

I am trying to read a PDF in AWS Lambda. The PDF is stored in an S3 bucket. I need to extract the text from the PDF and translate it into any required language. I am able to run my code in my notebook, but when I run it on Lambda I get this error message in my CloudWatch logs: task timed out after 3.01 seconds.
import fitz
import base64
from io import BytesIO
from PIL import Image
import boto3

def lambda_handler(event, context):
    s3 = boto3.client('s3')
    client_textract = boto3.client('textract')
    translate_client = boto3.client('translate')
    try:
        print("Inside handler")
        s3_bucket = "my_bucket"
        pdf_file_name = 'sample.pdf'
        pdf_file = s3.get_object(Bucket=s3_bucket, Key=pdf_file_name)
        file_content = pdf_file['Body'].read()
        print("Before reading ")
        with fitz.open(stream=file_content, filetype="pdf") as doc:
            # (the rest of the question's code is cut off here)
Try to extend the timeout, which by default is set to 3 seconds.
If that does not help, try to increase the allocated memory.
Also, you may consider pushing
s3 = boto3.client('s3')
client_textract = boto3.client('textract')
translate_client = boto3.client('translate')
out of your handler and putting it right after the imports. The function will run more efficiently on frequent invocations.
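A minimal sketch of that layout, reusing the names from the question (the handler body here is only a placeholder, and it assumes a recent PyMuPDF where page.get_text() is available):

import boto3
import fitz

# Clients are created once per execution environment and reused across warm invocations
s3 = boto3.client('s3')
client_textract = boto3.client('textract')
translate_client = boto3.client('translate')

def lambda_handler(event, context):
    pdf_file = s3.get_object(Bucket="my_bucket", Key="sample.pdf")
    file_content = pdf_file['Body'].read()
    with fitz.open(stream=file_content, filetype="pdf") as doc:
        # Concatenate the text of every page
        text = "".join(page.get_text() for page in doc)
    return {"characters": len(text)}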

Send xlsx file using SES in AWS lambda function

SITUATION
I have created a Lambda function whose output is an Excel file that gets saved to an S3 bucket. This part of the function works as expected.
As part of the function's operation, I would also like to be able to email the generated Excel file to selected recipients.
CODE:
# IMPORT MODULES
import boto3
import pandas as pd
import io
from io import BytesIO
from io import StringIO
from datetime import date
import email
import email.header
import email.policy
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

def lambda_handler(event, context):
    # GENERATE CURRENT DATE TO APPEND TO FILE
    today = date.today()
    date_val = today.strftime("%B %d, %Y")
    # CREATE DATAFRAME
    df = pd.DataFrame({'Data': [10, 22, 31, 43, 57, 99, 65, 74, 88]})
    # EVALUATE VARIABLES AS ODD OR EVEN INTEGERS
    even = df.loc[df['Data'] % 2 == 0]
    odd = df.loc[df['Data'] % 2 != 0]
    # SPECIFY BUCKET NAME AND OUTPUT FILE PATH
    bucket = 'my-bucket'
    filepath = 'output/My_Excel_File_{}.xlsx'.format(date_val)
    # EXPORT MULTI-SHEET EXCEL FILE AND SEND TO S3 BUCKET
    with io.BytesIO() as output:
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            even.to_excel(writer, sheet_name='Even')
            odd.to_excel(writer, sheet_name='Odd')
        data = output.getvalue()
    s3 = boto3.resource('s3')
    s3.Bucket(bucket).put_object(Key=filepath, Body=data)
WHAT I HAVE TRIED
I have tried to achieve my aim by appending the following code to my function, referring to various documentation; however, this does not achieve the desired result.
# COMPOSE EMAIL AND ATTACH EXCEL FILE
message = MIMEMultipart()
message['Subject'] = 'Email subject'
message['From'] = 'sender.email@domain.com'
message['To'] = 'recipient.email@domain.com'
# MESSAGE BODY
part = MIMEText('This is the email body string', 'html')
message.attach(part)
# ATTACHMENT
if attachment_string:  # if bytestring available
    part = MIMEApplication(str.encode(attachment_string))
else:  # if file provided
    part = MIMEApplication(s3.get_object(Bucket='my-bucket', Key='My_Excel_File_{}.xlsx'.format(date_val)).read())
part.add_header('Content-Disposition', 'attachment', filename='My_Excel_File_{}.xlsx'.format(date_val))
message.attach(part)
response = client.send_raw_email(
    Source=message['From'],
    Destinations=['recipient.email@domain.com'],
    RawMessage={
        'Data': message.as_string()
    }
)
There are AWS examples that dynamically create Excel documents and email them. In that use case, they are implemented in Java and the app is a web app. See this:
Creating the DynamoDB web application item tracker
Although this example uses the AWS SDK for Java V2, it will give you an idea, and hopefully you can port it to the programming language you are using.
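For completeness, here is a minimal Python/boto3 sketch of the emailing step (bucket, key, and addresses are placeholders; it assumes the sender and recipient identities are verified in SES and that the workbook already exists in S3):

import boto3
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

s3 = boto3.client('s3')
ses = boto3.client('ses')

def send_workbook(bucket, key, sender, recipient):
    # Download the workbook bytes from S3
    body = s3.get_object(Bucket=bucket, Key=key)['Body'].read()

    # Build the MIME message with the workbook as an attachment
    message = MIMEMultipart()
    message['Subject'] = 'Generated Excel file'
    message['From'] = sender
    message['To'] = recipient
    message.attach(MIMEText('Please find the report attached.', 'html'))
    attachment = MIMEApplication(body)
    attachment.add_header('Content-Disposition', 'attachment',
                          filename=key.split('/')[-1])
    message.attach(attachment)

    # Send the raw message through SES
    return ses.send_raw_email(
        Source=sender,
        Destinations=[recipient],
        RawMessage={'Data': message.as_string()})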

Streaming Pipeline in Dataflow to Bigtable Python

I want to read from a Pub/Sub topic and write the data to Bigtable with Dataflow code written in Python. I could find sample code in Java but not in Python.
How can we assign columns in a row from Pub/Sub to different column families and write the data to Bigtable?
To write to Bigtable in a Dataflow pipeline, you'll need to create direct rows and pass them to the WriteToBigTable doFn. Here is a brief example that just passes in the row keys and adds one cell for each key, nothing too fancy:
import datetime

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud.bigtable import row

class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--bigtable-project',
            help='The Bigtable project ID, this can be different than your '
                 'Dataflow project',
            default='bigtable-project')
        parser.add_argument(
            '--bigtable-instance',
            help='The Bigtable instance ID',
            default='bigtable-instance')
        parser.add_argument(
            '--bigtable-table',
            help='The Bigtable table ID in the instance.',
            default='bigtable-table')

class CreateRowFn(beam.DoFn):
    def process(self, key):
        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            "stats_summary",
            b"os_build",
            b"android",
            datetime.datetime.now())
        return [direct_row]

def run(argv=None):
    """Build and run the pipeline."""
    options = MyOptions(argv)
    with beam.Pipeline(options=options) as p:
        p | beam.Create(["phone#4c410523#20190501",
                         "phone#4c410523#20190502"]) | beam.ParDo(
            CreateRowFn()) | WriteToBigTable(
                project_id=options.bigtable_project,
                instance_id=options.bigtable_instance,
                table_id=options.bigtable_table)

if __name__ == '__main__':
    run()
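As a hypothetical local test, the same run() function can be fed flags directly (the resource names here are placeholders):

# Launch the pipeline above with explicit flags; with DirectRunner this
# runs locally, while DataflowRunner would submit a managed job.
run(argv=[
    '--runner=DirectRunner',
    '--bigtable-project=my-project',    # placeholder
    '--bigtable-instance=my-instance',  # placeholder
    '--bigtable-table=my-table',        # placeholder
])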
I am just starting to explore this now and can link to a more polished version on GitHub once it's complete. Hope this helps you get started.
Building on top of what was proposed and adding PubSub, here's a working version.
Prerequisites
GCS bucket created (for Dataflow temp/staging files)
PubSub topic created
PubSub subscription created
BigTable instance created
BigTable table created
BigTable column family created (there is no visible error otherwise!)
Example of the latter with cbt:
cbt -instance test-instance createfamily test-table cf1
Code
Define and run the Dataflow pipeline.
# Packages
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud import pubsub_v1

# Classes
class CreateRowFn(beam.DoFn):
    def __init__(self, pipeline_options):
        self.instance_id = pipeline_options.bigtable_instance
        self.table_id = pipeline_options.bigtable_table

    def process(self, key):
        from google.cloud.bigtable import row
        import datetime
        direct_row = row.DirectRow(row_key=key)
        direct_row.set_cell(
            'cf1',
            'field1',
            'value1',
            timestamp=datetime.datetime.now())
        yield direct_row

# Options
class XyzOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--bigtable_project', default='nested')
        parser.add_argument('--bigtable_instance', default='instance')
        parser.add_argument('--bigtable_table', default='table')

pipeline_options = XyzOptions(
    save_main_session=True, streaming=True,
    runner='DataflowRunner',
    project=PROJECT,
    region=REGION,
    temp_location=TEMP_LOCATION,
    staging_location=STAGING_LOCATION,
    requirements_file=REQUIREMENTS_FILE,
    bigtable_project=PROJECT,
    bigtable_instance=INSTANCE,
    bigtable_table=TABLE)

# Pipeline
def run(argv=None):
    with beam.Pipeline(options=pipeline_options) as p:
        input_subscription = f"projects/{PROJECT}/subscriptions/{SUBSCRIPTION}"
        _ = (p
             | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(subscription=input_subscription).with_output_types(bytes)
             | 'Conversion UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
             | 'Conversion string to row object' >> beam.ParDo(CreateRowFn(pipeline_options))
             | 'Writing row object to BigTable' >> WriteToBigTable(project_id=pipeline_options.bigtable_project,
                                                                   instance_id=pipeline_options.bigtable_instance,
                                                                   table_id=pipeline_options.bigtable_table))

if __name__ == '__main__':
    run()
Publish a message b"phone#1111" to the PubSub topic (e.g. using the Python PublisherClient()).
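For example, a minimal publisher sketch (project and topic names are placeholders):

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path('my-project', 'my-topic')  # placeholders

# The pipeline decodes the payload as UTF-8 and uses it as the row key
future = publisher.publish(topic_path, b"phone#1111")
print(future.result())  # message ID once the publish succeeds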
Table content (using happybase)
b'phone#1111': {b'cf1:field1': b'value1'}
Row length: 1
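To come back to the original question about different column families: a hypothetical variant of CreateRowFn could split each message into fields and target several families. It assumes the message is a comma-separated string such as "phone#1111,android,10" and that the families cf1 and cf2 already exist in the table:

import apache_beam as beam

class CreateMultiFamilyRowFn(beam.DoFn):
    def process(self, message):
        from google.cloud.bigtable import row
        import datetime
        # Hypothetical payload layout: row_key,os_name,os_version
        row_key, os_name, os_version = message.split(',')
        direct_row = row.DirectRow(row_key=row_key)
        now = datetime.datetime.now()
        # Each set_cell call can target a different column family
        direct_row.set_cell('cf1', 'os_name', os_name.encode('utf-8'), timestamp=now)
        direct_row.set_cell('cf2', 'os_version', os_version.encode('utf-8'), timestamp=now)
        yield direct_row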

'For' loop for reading multiple csv files from a google storage bucket into 1 Pandas DataFrame

I currently have 31 .csv files (all with an identical structure: 60 columns wide and about 5,000 rows deep) that I'm trying to read from a Google Storage bucket into one pandas DataFrame using a 'for' loop, and I keep getting a 'timeout' error after 6 minutes.
Upon doing some testing, I have noticed that I'm able to read one .csv file at a time, but once I introduce two or more, I get the timeout error. This makes me think that my code is the problem rather than the size of the data.
Code is below (should I be using pd.concat at any stage in the for loop?). Any help would be appreciated.
def stage1eposdata(data, context):
    from google.cloud import storage
    from google.cloud import bigquery
    import pandas as pd
    import dask.dataframe as dd
    import io
    import numpy as np
    import datetime as dt
    from googleapiclient import discovery
    from pandas.io.json import json_normalize
    import google.auth
    import math

    destination_path1 = 'gs://staged_data/ddf-*_stet.csv'
    ## Source Buckets #
    raw_epos_bucket = 'raw_data'
    cleaned_epos_bucket = 'staged_data'
    # Confirming Oauth #
    storage_client = storage.Client()
    bigquery_client = bigquery.Client()
    # Confirming Connection #
    raw_epos_data = storage_client.bucket(raw_epos_bucket)
    cleaned_epos_data = storage_client.bucket(cleaned_epos_bucket)
    df = pd.DataFrame()
    for file in list(raw_epos_data.list_blobs(prefix='2019/')):
        file_path = "gs://{}/{}".format(file.bucket.name, file.name)
        df = df.append(pd.read_csv(file_path), sort=False)
    ddf = dd.from_pandas(df, npartitions=1, sort=True)
    ddf.to_csv(destination_path1, index=True, sep=',')
Try this:
## Source Buckets #
raw_epos_bucket = 'raw_data'
cleaned_epos_bucket = 'staged_data'
# Confirming Oauth #
storage_client = storage.Client()
bigquery_client = bigquery.Client()
# Confirming Connection #
raw_epos_data = storage_client.bucket(raw_epos_bucket)
cleaned_epos_data = storage_client.bucket(cleaned_epos_bucket)
my_dataframe_list = []
for file in list(raw_epos_data.list_blobs(prefix='2019/')):
    file_path = "gs://{}/{}".format(file.bucket.name, file.name)
    my_dataframe_list.append(pd.read_csv(file_path))
df = pd.concat(my_dataframe_list)
ddf = dd.from_pandas(df, npartitions=1, sort=True)
ddf.to_csv(destination_path1, index=True, sep=',')
pd.concat joins a list of DataFrames, so in each iteration of the loop you keep the DataFrame in the list my_dataframe_list and concatenate the list outside the loop.
If the columns match, it should work.
It turns out that dask can do this type of thing very well due to its 'lazy' computation feature. My solution is below
## Source Buckets #
raw_epos_bucket = 'raw_data'
cleaned_epos_bucket = 'staged_data'
# Confirming Oauth #
storage_client = storage.Client()
bigquery_client = bigquery.Client()
# Confirming Connection #
raw_epos_data = storage_client.bucket(raw_epos_bucket)
cleaned_epos_data = storage_client.bucket(cleaned_epos_bucket)
# '*' is a wildcard, so no more 'for' loops are needed!
# dd.read_csv already returns a lazy dask DataFrame, so there is no need
# to build an intermediate list or call dd.from_pandas.
ddf = dd.read_csv('gs://raw_data/*.csv')
ddf.to_csv(destination_path1, index=True, sep=',')

insert using pandas to_sql() missing data into clickhouse db

It's my first time using SQLAlchemy and pandas to insert some data into a ClickHouse DB.
When I try to insert some data using the ClickHouse CLI it works fine, but when I try the same thing using SQLAlchemy, one row is missing and I don't know why.
Have I done something wrong?
import pandas as pd
from sqlalchemy import create_engine, MetaData
# make_session comes from the ClickHouse SQLAlchemy dialect in use
# (e.g. clickhouse-sqlalchemy); df and uri are created earlier

engine = create_engine(uri)
session = make_session(engine)
metadata = MetaData(bind=engine)
metadata.reflect(bind=engine)
conn = engine.connect()
df.to_sql('test', conn, if_exists='append', index=False)
Let's try this way:
import pandas as pd
from infi.clickhouse_orm.engines import Memory
from infi.clickhouse_orm.fields import UInt16Field, StringField
from infi.clickhouse_orm.models import Model
from sqlalchemy import create_engine

# define the ClickHouse table schema
class Test_Humans(Model):
    year = UInt16Field()
    first_name = StringField()
    engine = Memory()

engine = create_engine('clickhouse://default:@localhost/test')

# create table
with engine.connect() as conn:
    conn.connection.create_table(Test_Humans)  # https://github.com/Infinidat/infi.clickhouse_orm/blob/master/src/infi/clickhouse_orm/database.py#L142

pdf = pd.DataFrame.from_records([
    {'year': 1994, 'first_name': 'Vova'},
    {'year': 1995, 'first_name': 'Anja'},
    {'year': 1996, 'first_name': 'Vasja'},
    {'year': 1997, 'first_name': 'Petja'},
    # ! sqlalchemy-clickhouse ignores the last item so add fake one
    {}
])

pdf.to_sql('test_humans', engine, if_exists='append', index=False)
Take into account that sqlalchemy-clickhouse ignores the last item, so add a fake one (see the source code and related issue 10).
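A quick way to check the outcome (a sketch, assuming a SQLAlchemy 1.x engine where engine.execute() is still available):

# Count the rows that actually landed in the table; the four real records
# should be there, while the trailing empty dict is the one that gets dropped.
result = engine.execute('SELECT count() FROM test_humans')
print(result.fetchall())  # e.g. [(4,)]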
