Reading the contents of a gzip file stored in AWS S3 from Python.
I want to load the contents into a DataFrame.
In case you are trying to load JSON data into a DataFrame, here is the code:
import pandas as pd
import boto3
from io import StringIO
import gzip

# A client is used for listing keys and a resource for fetching objects
client = boto3.client('s3', aws_access_key_id='', aws_secret_access_key='')
resource = boto3.resource('s3', aws_access_key_id='', aws_secret_access_key='')

list_keys = []
lst = []

# Collect the keys of every object under the prefix
for key in client.list_objects(Bucket='bucket_name', Prefix='Folder name')['Contents']:
    list_keys.append(key["Key"])

# Decompress each object and read it as newline-delimited JSON
for key in list_keys:
    try:
        obj = resource.Object("bucket_name", key)
        with gzip.GzipFile(fileobj=obj.get()["Body"]) as gzipfile:
            temp_data = pd.read_json(StringIO(gzipfile.read().decode('UTF-8')), lines=True)
        lst.append(temp_data)
    except Exception:
        pass  # skip objects that cannot be parsed

df = pd.concat(lst, ignore_index=True)
I am trying to read a JSON file in project B on Google Cloud using a service account from project A. The service account in project A has been granted a read role in project B. But when I try to open the JSON file I am getting:
"FileNotFoundError: [Errno 2] No such file or directory: 'gs://x.json'". The file x.json does exist.
I checked the list of permissions needed to read a file (storage.objects.get, storage.objects.list) at https://cloud.google.com/storage/docs/access-control/iam-permissions.
Any help is appreciated. Thanks.
from google.cloud import bigquery
from analytics import Clients, ClientType
from datetime import datetime, date, timedelta
from pytz import timezone
from typing import List
from pyarrow import json as pyj
import pyarrow.parquet as pq
import newlinejson as nlj

bigquery_client = Clients.get_client(ClientType.STORAGE, name='w')
write_client = Clients.get_client(ClientType.BIGQUERY, name='w')
k_client = Clients.get_client(ClientType.BIGQUERY, name='w')

bucket = 'update'
file_name_prefix = "al_"
target_table = k_client.get_table("w.junk.json_table1")

def get_dates() -> List[str]:
    """
    Return dates for which log files have to be checked
    """
    end = date.fromisoformat(str(datetime.date(datetime.now(timezone("EST")))))
    return [str(end - timedelta(days=1)), str(end)]

def get_bucket_files(bucket, file_name_prefix):
    # if full_path:
    path = "gs://{}/{}"
    # path = "https://storage.googleapis.com/{}/{}"
    return [
        path.format(bucket, b.name)
        for b in bigquery_client.list_blobs(bucket, prefix=file_name_prefix)
    ]

def get_latest_file() -> str:
    """
    Get all files for the current prefix between start and end date
    """
    files = []
    files_json = []
    for d in get_dates():
        prefix = file_name_prefix + d[4:] + "-" + d[:4]
        files += get_bucket_files(bucket, file_name_prefix)
    for k in files:
        filename = k.split('/')[-1]
        if 'json' in filename:
            files_json.append(k)
    return max(files_json)

def pipeline():
    job_config = bigquery.LoadJobConfig(
        # schema=[
        #     bigquery.SchemaField("name", "STRING")
        # ],
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )
    f = get_latest_file()
    print(f)
    table = pyj.read_json(f)
    # pq.write_table(target_table, table.parquet)
    # with nlj.open(f) as src:
    #     with nlj.open('out.json', 'w') as dst:
    #         for line in src:
    #             dst.write(line)
    # k_client.load_table_from_uri(
    #     f, target_table, job_config=job_config
    # ).result()

pipeline()
You may consider and try the approach below for listing objects in a bucket:
from google.cloud import storage

my_bucket = 'your-bucket-name'
my_prefix = 'al_'

client = storage.Client()

def get_bucket_files(bucket, file_name_prefix):
    for blob in client.list_blobs(bucket, prefix=file_name_prefix):
        print(str(blob))

get_bucket_files(my_bucket, my_prefix)
Output: the representation of each matching blob is printed.
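The traceback suggests the gs:// path is being treated as a local file path, which is why the open/read call cannot find it; the object has to be fetched through a GCS client first. As a minimal sketch (assuming the same google-cloud-storage client, a newline-delimited JSON file, and placeholder bucket/object names), the blob can be downloaded and handed to pandas:

from io import StringIO

import pandas as pd
from google.cloud import storage

client = storage.Client()  # authenticated with the project A service account

def read_json_blob(bucket_name, blob_name):
    # Fetch the object through the GCS client instead of open('gs://...')
    blob = client.bucket(bucket_name).blob(blob_name)
    content = blob.download_as_text()  # needs storage.objects.get on project B
    # lines=True because the file is assumed to be newline-delimited JSON
    return pd.read_json(StringIO(content), lines=True)

df = read_json_blob('your-bucket-name', 'x.json')  # placeholder names

If the file is a single JSON document rather than newline-delimited, drop lines=True or use json.loads(content) instead.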
I have attached my hardcoded Python program, which appends two JSON files from S3 storage manually. Can someone please tell me how to get multiple input files (JSON files) from the S3 bucket automatically? I know we can do this in Python using *.json in the program's directory, but I don't understand how to do the same in AWS Lambda.
Python Code:
import glob
import json

result = []
for f in glob.glob("*.json"):
    with open(f, "r") as infile:
        result += json.load(infile)

with open("merge.json", "w") as outfile:
    json.dump(result, outfile)
In Lambda I am able to do the same for two files; can someone please suggest how to take all JSON files from S3 automatically in Lambda? Thanks in advance.
import boto3
import json

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def lambda_handler(event, context):
    object_key = "sample1.json"  # replace object key
    file_content = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key)["Body"].read()
    print(file_content)

    object_key2 = "sample2.json"  # replace object key
    file_content2 = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key2)["Body"].read()
    print(file_content2)

    result = []
    result += json.loads(file_content)
    result += json.loads(file_content2)
    print(result)
I have followed the syntax from the documentation, but I still get a timeout error.
import boto3

# Create a client
client = boto3.client('s3', region_name='us-east-1')

# Create a reusable Paginator
paginator = client.get_paginator('list_objects')

# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

for page in page_iterator:
    print(page['Contents'])
Getting a timeout error:
import boto3

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def iterate_bucket_items(S3_BUCKET):
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=S3_BUCKET)

    for page in page_iterator:
        if page['KeyCount'] > 0:
            for item in page['Contents']:
                yield item

for i in iterate_bucket_items(S3_BUCKET):
    print(i)
I have solved the issue with the help of @JeremyThompson; my final code is attached here:
import json
import boto3
import glob

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket-for-json-files')

    # Create a client
    client = boto3.client('s3', region_name='us-east-1')

    # Create a reusable Paginator
    paginator = client.get_paginator('list_objects')

    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

    result = []
    for page in page_iterator:
        result += page['Contents']

    s3 = boto3.client('s3')
    bucket = 'bucket-for-json-files'
    merge = []
    lst = []

    for i in result:
        cmd = i['Key']
        print(cmd)
The above code prints the key from each json file available in the user's bucket.
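To go from printing keys to actually merging the files, the printing loop can be extended (a sketch only, reusing result, s3, bucket and merge from the code above and assuming each object holds a JSON array) to fetch and parse every object and then write the combined list back to S3:

    for i in result:
        key = i['Key']
        if not key.endswith('.json'):
            continue  # only merge JSON objects
        body = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        merge += json.loads(body)  # assumes each file contains a JSON array

    # write the merged list back to the bucket as a single object
    s3.put_object(Bucket=bucket, Key='merge.json', Body=json.dumps(merge))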
I am new to Python and AWS Glue.
I am trying to merge a few Excel files from a source S3 bucket and generate one output file (CSV) in a target S3 bucket. I am able to read the files and generate the output file with the merged data, but the only problem is that the header is repeated from each file.
Can someone help me debug this and remove the repeating headers?
Below is my code:
import pandas as pd
import glob
import xlrd
import openpyxl
import boto3
import io
import json
import os
from io import StringIO
import numpy as np

s3 = boto3.resource('s3')
bucket = s3.Bucket('test bucket')
prefix_objs = bucket.objects.filter(Prefix='source/file')

prefix_df = []
for obj in prefix_objs:
    key = obj.key
    print(key)
    temp = pd.read_excel(obj.get()['Body'], encoding='utf8')
    prefix_df.append(temp)

bucket = 'test bucket'
csv_buffer = StringIO()

for current_df in prefix_df:
    current_df.to_csv(csv_buffer, index=None)
    print(current_df)

s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'merge.csv').put(Body=csv_buffer.getvalue())
Please help!
Regards,
Vijay
Change this line and add the header parameter:
temp = pd.read_excel(obj.get()['Body'], encoding='utf8')
to
temp = pd.read_excel(obj.get()['Body'], encoding='utf8', header=1)
or
temp = pd.read_excel(obj.get()['Body'], encoding='utf8', skiprows=1)
You need to test the header value, because sometimes the header does not start in the first row.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
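One way to test where the header actually starts (a quick sketch; the bucket and key are placeholders, and the S3 access mirrors the question's code) is to read one file with header=None and inspect the first few rows:

import boto3
import pandas as pd

s3 = boto3.resource('s3')
obj = s3.Object('test bucket', 'source/file/example.xlsx')  # placeholder key

# header=None keeps every row as data, so the real header row is visible
preview = pd.read_excel(obj.get()['Body'], header=None)
print(preview.head())

# If the column names show up in row N of the preview, pass header=N to read_excel

Note that the correct header (or skiprows) value may differ from file to file, so it is worth previewing more than one.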
In my S3 bucket directory I have multiple file types: .csv, .log, .txt, etc. But I need to read only the .log files from a single directory and append them using boto3. I tried the code below, but it reads the data from all files, I am not able to restrict it to *.log, and the result comes back as a single line separated by '\n', as shown below.
How can I read only the log files, merge them, and get the result line by line?
import boto3
import pandas as pd
import csv

s3 = boto3.resource('s3')
my_bucket = s3.Bucket('my_bucket')

lst = []
for object in my_bucket.objects.filter(Prefix="bulk_data/all_files/"):
    print(object.key)
    bdy = object.get()['Body'].read().decode('utf-8')
    lst.append(bdy)
    bdy = ''

print(lst)
The lst output comes out like this, with '\n' as the separator:
'12345,6006,7290,7200,JKHBJ,S,55\n44345,6996,6290,7288,JKHkk,R,57\n..........'
I should get something like below:
12345,6006,7290,7200,JKHBJ,S,55
44345,6996,6290,7288,JKHkk,R,57
...
The filter takes only a prefix, not a suffix, so you have to filter the keys yourself, for example:
import boto3
import pandas as pd
import csv

s3 = boto3.resource('s3')
my_bucket = s3.Bucket('my_bucket')

lst = []
for s3obj in my_bucket.objects.filter(Prefix="attachments/"):
    # skip s3 objects not ending with csv
    if not s3obj.key.endswith('csv'):
        continue
    print(s3obj.key)
    bdy = s3obj.get()['Body'].read().decode('utf-8')
    lst.append(bdy)
    bdy = ''

# print(lst)
for file_str in lst:
    for line in file_str.split('\n'):
        print(line)
The objective of this code is to read an existing CSV file from a specified S3 bucket into a DataFrame, filter the DataFrame for the desired columns, and then write the filtered DataFrame to a CSV object using StringIO that I can upload to a different S3 bucket.
Everything works right now except the code block for the function prepare_file_for_upload. Below is the full code block:
from io import StringIO
import io  # unused at the moment
import logging
import pandas as pd
import boto3
from botocore.exceptions import ClientError

FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)

# S3 parameters
source_bucket = 'REPLACE'
source_folder = 'REPLACE/'
dest_bucket = 'REPLACE'
dest_folder = 'REPLACE'
output_name = 'REPLACE'

def get_file_name():
    try:
        s3 = boto3.client("s3")
        logging.info(f'Determining filename from: {source_bucket}/{source_folder}')
        bucket_path = s3.list_objects(Bucket=source_bucket, Prefix=source_folder)
        file_name = [key['Key'] for key in bucket_path['Contents']][1]
        logging.info(file_name)
        return file_name
    except ClientError as e:
        logging.info(f'Unable to determine file name from bucket {source_bucket}/{source_folder}')
        logging.info(e)

def get_file_data(file_name):
    try:
        s3 = boto3.client("s3")
        logging.info(f'file name from get data: {file_name}')
        obj = s3.get_object(Bucket=source_bucket, Key=file_name)
        body = obj['Body']
        body_string = body.read().decode('utf-8')
        file_data = pd.read_csv(StringIO(body_string))
        # logging.info(file_data)
        return file_data
    except ClientError as e:
        logging.info(f'Unable to read {file_name} into dataframe')
        logging.info(e)

def filter_file_data(file_data):
    try:
        all_columns = list(file_data.columns)
        columns_used = ('col_1', 'col_2', 'col_3')
        desired_columns = [x for x in all_columns if x in columns_used]
        filtered_data = file_data[desired_columns]
        logging.info(type(filtered_data))  # for testing
        return filtered_data
    except Exception as e:
        logging.info('Unable to filter file')
        logging.info(e)
The block below is where I am attempting to write the existing DataFrame passed to the function to CSV with the to_csv method and StringIO instead of creating a local file. to_csv will write to a local file, but it does not work with the buffer (yes, I tried moving the buffer cursor to the start position afterwards and still nothing).
def prepare_file_for_upload(filtered_data):  # this is the function block where I am stuck
    try:
        buffer = StringIO()
        output_name = 'FILE_NAME.csv'
        # code below is writing to file but can not get to write to buffer
        output_file = filtered_data.to_csv(buffer, sep=',')
        df = pd.DataFrame(buffer)  # for testing
        logging.info(df)  # for testing
        return output_file
    except Exception as e:
        logging.info(f'Unable to prepare {output_name} for upload')
        logging.info(e)

def upload_file(adjusted_file):
    try:
        # dest_key = f'{dest_folder}/{output_name}'
        dest_key = f'{output_name}'
        s3 = boto3.resource('s3')
        s3.meta.client.upload_file(adjusted_file, dest_bucket, dest_key)
    except ClientError as e:
        logging.info(f'Unable to upload {output_name} to {dest_key}')
        logging.info(e)

def execute_program():
    file_name = get_file_name()
    file_data = get_file_data(file_name)
    filtered_data = filter_file_data(file_data)
    adjusted_file = prepare_file_for_upload(filtered_data)
    upload_file = upload_file(adjusted_file)

if __name__ == '__main__':
    execute_program()
The following solution worked for me:
csv_buffer = StringIO()
output_file = filtered_data.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(dest_bucket, output_name).put(Body=csv_buffer.getvalue())
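Fitted back into the question's structure, prepare_file_for_upload can return the buffer contents and the upload step can use put instead of upload_file, since upload_file expects a path to a local file rather than an in-memory string. This is a rough sketch reusing the names from the question; index=False and the destination key are my own choices:

def prepare_file_for_upload(filtered_data):
    try:
        buffer = StringIO()
        filtered_data.to_csv(buffer, sep=',', index=False)
        return buffer.getvalue()  # the CSV text itself, not a file path
    except Exception as e:
        logging.info(f'Unable to prepare {output_name} for upload')
        logging.info(e)

def upload_file(adjusted_file):
    try:
        dest_key = f'{output_name}'
        s3 = boto3.resource('s3')
        # put accepts the in-memory CSV string directly as the object body
        s3.Object(dest_bucket, dest_key).put(Body=adjusted_file)
    except ClientError as e:
        logging.info(f'Unable to upload {output_name} to {dest_bucket}')
        logging.info(e)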
When working with a BytesIO object, pay careful attention to the order of operations. In your code, you instantiate the BytesIO object and then fill it via a call to to_csv(). So far so good. But one thing to manage when working with a BytesIO object, which differs from a file workflow, is the stream position.
After writing data to the stream, the stream position is at the end of the stream. If you then upload from that position, the read starts at the end and you will likely upload nothing! The operation completes, leaving you scratching your head about why no results are written to S3. Add a call to seek() with the argument 0 to your function before uploading. Here is a demo program:
from io import BytesIO

import boto3
import pandas
from pandas import util

df = util.testing.makeMixedDataFrame()
s3_resource = boto3.resource("s3")

buffer = BytesIO()
df.to_csv(buffer, sep=",", index=False, mode="wb", encoding="UTF-8")

# The following call to `tell()` returns the stream position. 0 is the beginning of the file.
buffer.tell()
>> 134

# Reposition the stream to the beginning by calling `seek(0)` before uploading
buffer.seek(0)

s3_resource.Object("test-bucket", "test_df_from_resource.csv").put(Body=buffer.getvalue())
You should get a response similar to the following (with actual values)
>> {'ResponseMetadata': {'RequestId': 'request-id-value',
'HostId': '###########',
'HTTPStatusCode': 200,
'HTTPHeaders': {'x-amz-id-2': '############',
'x-amz-request-id': '00000',
'date': 'Tue, 31 Aug 2021 00:00:00 GMT',
'x-amz-server-side-encryption': 'value',
'etag': '"xxxx"',
'server': 'AmazonS3',
'content-length': '0'},
'RetryAttempts': 0},
'ETag': '"xxxx"',
'ServerSideEncryption': 'value'}
Changing the code to move the stream position should solve the issues you were facing. It is also worth mentioning that Pandas had a bug that caused unexpected behavior when writing to a bytes object. It was fixed, and the sample I provided assumes you are running a version of Python greater than 3.8 and a version of Pandas greater than 1.3.2. Further information on IO can be found in the Python documentation.