How to use aioboto3 & asyncio to download files from S3 (AWS) - python-3.x

I have a sync script which is running and working well, but I see that some file downloads take time, so I thought of using an async approach here.
import json
import os
import io
import time
import gzip
import re
import logging
from logging.handlers import RotatingFileHandler
import boto3
AWS_KEY = "**"
AWS_SECRET = "**"
QUEUE_URL = "***"
OUTPUT_PATH = "./test"
VISIBILITY_TIMEOUT = 10
REGION_NAME = "region"
SLEEP_TIME = 5  # seconds to pause between SQS polls (example value)
sqs = boto3.resource('sqs', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
s3 = boto3.client('s3', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
queue = sqs.Queue(url=QUEUE_URL)
def handle_response(msg, path):
    """Logic goes here"""
    print('message: %s' % msg)
def download_message_files(msg):
    for s3_file in msg['files']:
        s3_path = s3_file['path']
        with io.BytesIO() as f:
            s3.download_fileobj(msg['bucket'], s3_path, f)
            f.seek(0)
            for line in gzip.GzipFile(fileobj=f):
                handle_response(line.decode('UTF-8'), s3_path)
def consume():
    while True:
        for msg in queue.receive_messages(VisibilityTimeout=VISIBILITY_TIMEOUT):
            body = json.loads(msg.body)  # grab the actual message body
            download_message_files(body)
            msg.delete()
        time.sleep(SLEEP_TIME)
if __name__ == '__main__':
    # Setup our root logger
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s")
    # Create our FDR logger
    logger = logging.getLogger("Consumer")
    # Rotate log file handler
    RFH = RotatingFileHandler("test.log", maxBytes=20971520, backupCount=5)
    # Log file output format
    F_FORMAT = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s')
    # Set the log file output level to INFO
    RFH.setLevel(logging.INFO)
    # Add our log file formatter to the log file handler
    RFH.setFormatter(F_FORMAT)
    # Add our log file handler to our logger
    logger.addHandler(RFH)
    consume()
I have tried converting this using aioboto3 but got stuck on the queue approach.
session = aioboto3.Session()
sqs = session.resource('sqs', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
s3 = session.client('s3', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
queue = sqs.Queue(url=QUEUE_URL) # <---- this gives the error: 'ResourceCreatorContext' object has no attribute 'Queue'
As far as I understand, the attribute does not exist in this context, but could anyone guide me on how to make this work asynchronously?

You can use asyncio and aioboto3 together.
Instead of creating a resource, you can use a client. The difference between an aioboto3 client and an aioboto3 resource can be found in this answer.
This is a simple working example:
import aioboto3

async def consume():
    async with aioboto3.Session().client(
            service_name='sqs', region_name=REGION_NAME,
            aws_access_key_id=AWS_KEY,
            aws_secret_access_key=AWS_SECRET) as client:
        response = await client.receive_message(
            QueueUrl=QUEUE_URL, VisibilityTimeout=VISIBILITY_TIMEOUT)
        for message in response.get('Messages', []):
            # Do something
            ...
This should solve the error you are facing. This solution can also be extended to S3 as per your requirements.
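For the S3 part, the same pattern should work: open an async S3 client and await download_fileobj, which aioboto3 exposes as a coroutine. A minimal sketch, assuming the same message layout (msg['bucket'], msg['files']) and the synchronous handle_response from the original script:
import gzip
import io

import aioboto3

async def download_message_files(msg):
    session = aioboto3.Session()
    async with session.client('s3', region_name=REGION_NAME,
                              aws_access_key_id=AWS_KEY,
                              aws_secret_access_key=AWS_SECRET) as s3:
        for s3_file in msg['files']:
            s3_path = s3_file['path']
            with io.BytesIO() as f:
                # Same call as in the sync script, but awaitable under aioboto3
                await s3.download_fileobj(msg['bucket'], s3_path, f)
                f.seek(0)
                for line in gzip.GzipFile(fileobj=f):
                    handle_response(line.decode('UTF-8'), s3_path)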

Related

Sending Log Data to Splunk using Python

I have an app that detects file changes, then backs up and syncs the files to Azure.
I currently have a logger set up that writes log events to a file called log.log. I also have event data streaming to stdout. This is my current working code.
I’d like to send log data to Splunk via requests.post() or logging.handlers.HTTPHandler.
Question: How do I set up an HTTP Handler in Python logging?
(I need to become more familiar with the advanced features of logging in Python.)
import logging

def setup_logger(logger_name: str = __name__, logfile: str = 'log.log'):
    """Standard logging: stdout and log file.

    Args:
        logger_name (str, optional): Logger name. Defaults to __name__.
        logfile (str, optional): Log file name. Defaults to 'log.log'.

    Returns:
        logging.Logger: the configured logger object.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(logfile)
    fh.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s | %(name)s | %(levelname)s | %(message)s',
        '%m-%d-%Y %H:%M:%S')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger

if __name__ == "__main__":
    logger = setup_logger('logger', 'log-sample.log')  # Creates a test file vs default log.log
    logger.info("My Logger has been initialized")
Currently I’m trying to send test data to Splunk via this code example (before I figure out the logging issue):
import requests

# Set up the Splunk HEC URL and token
splunk_url = "http://127.0.0.1:8088/services/collector/event"
splunk_token = "57489f00-605e-4f2a-8df3-123456789abcdef="
# Set up the log event data
log_data = {
    "event": "This is a test log event",
    "sourcetype": "my_sourcetype",
    "index": "test_index"
}
# Send the log event to Splunk
response = requests.post(splunk_url, json=log_data, headers={
    "Authorization": f"Splunk {splunk_token}"
})
# Check the response status code to make sure the request was successful
if response.status_code == 200:
    print("Log event sent to Splunk successfully")
else:
    print(f"Error sending log event to Splunk: {response.text}")
I found the solution myself.
import logging

import requests
import urllib3

urllib3.disable_warnings()  # using the default self-signed cert

url = "https://127.0.0.1:8088/services/collector/event"
headers = {"Authorization": "Splunk 09584dbe-183b-4d14-9ee9-be66a37b331a"}
index = 'test_index'

class CustomHttpHandler(logging.Handler):
    def __init__(self, url: str, headers: dict, index: str) -> None:
        self.url = url
        self.headers = headers
        self.index = index
        super().__init__()

    def emit(self, record: logging.LogRecord) -> None:
        '''
        This function gets called when a log event gets emitted. It receives a
        record, formats it and sends it to the url.

        Parameters:
            record: a log record (created by the logging module)
        '''
        log_entry = self.format(record)
        requests.post(
            url=self.url, headers=self.headers,
            json={"index": self.index, "event": log_entry},
            verify=False)

def setup_logger(logger_name: str = __name__, logfile: str = 'log.log'):
    """Standard logging: stdout, log file, and Splunk HEC.

    1. creates a file handler which logs even debug messages: fh
    2. creates a console handler with a higher log level: ch
    3. creates a formatter and adds it to the handlers: formatter, setFormatter
    4. adds the handlers to the logger: addHandler

    Args:
        logger_name (str, optional): Logger name. Defaults to __name__.
        logfile (str, optional): Log file name. Defaults to 'log.log'.

    Returns:
        logging.Logger: the configured logger object.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(logfile)
    fh.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s | %(name)s | %(levelname)s | %(message)s',
        '%m-%d-%Y %H:%M:%S')
    splunk_handler = CustomHttpHandler(url=url, headers=headers, index=index)
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    splunk_handler.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.addHandler(splunk_handler)
    return logger

if __name__ == "__main__":
    logger = setup_logger('logger', 'app.log')
    logger.info("My Logger has been initialized")

Not able to read from Tweets from twitter

I am trying to read tweets containing specific keywords using Docker. I have taken reference from this GitHub link.
I have made some minor changes. While trying to execute it, I am facing an issue with the number of arguments, even though all the details are in place. It would be great if anybody could point out where I'm going wrong.
### twitter
import tweepy
from tweepy.auth import OAuthHandler
from tweepy import Stream
#from tweepy.streaming import StreamListener
import json
import logging

### logging
FORMAT = "%(asctime)s | %(name)s - %(levelname)s - %(message)s"
LOG_FILEPATH = "C:\\docker-kafka\\log\\testing.log"
logging.basicConfig(
    filename=LOG_FILEPATH,
    level=logging.INFO,
    filemode='w',
    format=FORMAT)

### Authenticate to Twitter
with open('C:\\docker-kafka\\credential.json', 'r') as f:
    credential = json.load(f)

CONSUMER_KEY = credential['twitter_api_key']
CONSUMER_SECRET = credential['twitter_api_secret_key']
ACCESS_TOKEN = credential['twitter_access_token']
ACCESS_TOKEN_SECRET = credential['twitter_access_token_secret']
BEARER_TOKEN = credential['bearer_token']

#from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         value_serializer=lambda v: v.encode('utf-8'))  # Same port as your Kafka server
topic_name = "docker-twitter"

class twitterAuth():
    """SET UP TWITTER AUTHENTICATION"""
    def authenticateTwitterApp(self):
        auth = OAuthHandler(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth

class TwitterStreamer():
    """SET UP STREAMER"""
    def __init__(self):
        self.twitterAuth = twitterAuth()

    def stream_tweets(self):
        while True:
            listener = ListenerTS()
            auth = self.twitterAuth.authenticateTwitterApp()
            stream = Stream(auth, listener)
            stream.filter(track=["Starbucks"], stall_warnings=True, languages=["en"])

class ListenerTS(tweepy.Stream):
    def on_status(self, status):
        tweet = json.dumps({
            'id': status.id,
            'text': status.text,
            'created_at': status.created_at.strftime("%Y-%m-%d %H:%M:%S")
        }, default=str)
        producer.send(topic_name, tweet)
        return True

if __name__ == "__main__":
    TS = TwitterStreamer()
    TS.stream_tweets()
Answer reference:
Not able to read from Tweets from twitter
As far as I understand, the class tweepy.Stream needs to be initialized with the credentials, even when you inherit from it. So try:
class ListenerTS(tweepy.Stream):
    def __init__(self):
        tweepy.Stream.__init__(self, CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
Also check these docs:
https://docs.tweepy.org/en/stable/streaming.html
And maybe this link:
https://improveandrepeat.com/2022/04/python-friday-117-streaming-search-results-with-tweepy/
I was able to resolve the issue by adding the secrets but got a different error.
def stream_tweets(self):
    while True:
        listener = ListenerTS(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        listener.filter(track=["Starbucks"], stall_warnings=True, languages=["en"])
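For reference, here is a consolidated sketch of how this typically looks in Tweepy v4, where the Stream subclass itself holds the credentials and filter() is called on it directly; the Kafka producer and topic_name are the ones defined in the question's code, and this is an illustration rather than the asker's final version:
import json
import tweepy

class ListenerTS(tweepy.Stream):
    """Receives statuses and forwards them to Kafka."""
    def on_status(self, status):
        tweet = json.dumps({
            'id': status.id,
            'text': status.text,
            'created_at': status.created_at.strftime("%Y-%m-%d %H:%M:%S")
        }, default=str)
        producer.send(topic_name, tweet)

if __name__ == "__main__":
    # Tweepy v4: credentials go to the Stream constructor; no separate listener/auth objects
    stream = ListenerTS(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    stream.filter(track=["Starbucks"], stall_warnings=True, languages=["en"])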

EventHub and Receive

All,
I modified the sample Receive Python script for Azure Event Hub a bit, but when I run it, it goes into a loop fetching the same events over and over. I'm not sending any events to the Event Hub since I only want to read what is already there, and I don't see a while loop here, so how is this happening, and how do I stop after it has read all the events currently in the Event Hub?
Thanks
grajee
# https://learn.microsoft.com/en-us/python/api/overview/azure/eventhub-readme?view=azure-python#consume-events-from-an-event-hub
import logging

from azure.eventhub import EventHubConsumerClient

connection_str = 'Endpoint=sb://testhubns01.servicebus.windows.net/;SharedAccessKeyName=getevents;SharedAccessKey=testtestest='
consumer_group = '$Default'
eventhub_name = 'testpart'
client = EventHubConsumerClient.from_connection_string(connection_str, consumer_group, eventhub_name=eventhub_name)

logger = logging.getLogger("azure.eventhub")
logging.basicConfig(level=logging.INFO)

def on_event(partition_context, event):
    logger.info("Received event from partition: \"{}\" : \"{}\"".format(partition_context.partition_id, event.body_as_str()))
    partition_context.update_checkpoint(event)

with client:
    client.receive(
        on_event=on_event,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )
    # receive events from specified partition:
    # client.receive(on_event=on_event, partition_id='0')
client.close()
The piece of code below, from here, makes it clearer: client.receive() is designed to keep listening indefinitely rather than return once the existing events are read, and because no checkpoint store is configured above, update_checkpoint() does not persist the position, so every run starts again from starting_position="-1" (the beginning of the partition). With a blob checkpoint store, a restart resumes from the last checkpoint instead.
import asyncio

from azure.eventhub.aio import EventHubConsumerClient
from azure.eventhub.extensions.checkpointstoreblobaio import BlobCheckpointStore

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'
storage_connection_str = '<< CONNECTION STRING FOR THE STORAGE >>'
container_name = '<< NAME OF THE BLOB CONTAINER >>'

async def on_event(partition_context, event):
    # do something
    await partition_context.update_checkpoint(event)  # Or update_checkpoint every N events for better performance.

async def receive(client):
    await client.receive(
        on_event=on_event,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )

async def main():
    checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)
    client = EventHubConsumerClient.from_connection_string(
        connection_str,
        consumer_group,
        eventhub_name=eventhub_name,
        checkpoint_store=checkpoint_store,  # For load balancing and checkpoint. Leave None for no load balancing.
    )
    async with client:
        await receive(client)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
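As for stopping once the existing events have been read: receive() keeps listening indefinitely by design, so the usual approach is to run it as a task and cancel it after some window. A minimal sketch along those lines, reusing the connection settings and on_event from the snippet above (the 30-second window is an arbitrary example, not something the SDK requires):
import asyncio

async def main():
    checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)
    client = EventHubConsumerClient.from_connection_string(
        connection_str, consumer_group, eventhub_name=eventhub_name,
        checkpoint_store=checkpoint_store)
    async with client:
        # Run the receiver as a task so it can be stopped from here.
        receive_task = asyncio.ensure_future(
            client.receive(on_event=on_event, starting_position="-1"))
        await asyncio.sleep(30)   # give it time to drain the backlog; tune to your volume
        receive_task.cancel()     # receive() never returns on its own

if __name__ == '__main__':
    asyncio.run(main())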

How to publish and subscribe .pdf file in Google Pub/Sub (GCP)

In the code below, a big .pdf file is split into single pages, which are uploaded to a bucket and enqueued to Pub/Sub at the same time.
def publish_messages(project_id, topic_id, enqueue_file):
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_id)
    data = enqueue_file
    # Data must be a bytestring
    data = data.encode("utf-8")
    # When you publish a message, the client returns a future.
    future = publisher.publish(topic_path, data=data)
    print(future.result())
    print(enqueue_file + " has been enqueued to Pub/Sub.")

def upload_local_directory_to_gcs(local_path, bucket, gcs_path):
    assert os.path.isdir(local_path)
    for local_file in glob.glob(local_path + '/**'):
        if not os.path.isfile(local_file):
            continue
        remote_path = os.path.join(gcs_path, local_file[1 + len(local_path):])
        storage_client = storage.Client()
        buck = storage_client.bucket(bucket)
        blob = buck.blob(remote_path)
        blob.upload_from_filename(local_file)
        print("Uploaded " + local_file + " to gs bucket " + bucket)
        publish_messages("Project1", "my-topic", local_file)
I receive messages using the below code
def receive_messages(project_id, subscription_id, timeout=None):
    from concurrent.futures import TimeoutError
    from google.cloud import pubsub_v1

    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(project_id, subscription_id)

    def callback(message):
        print("Received message: {}".format(message))
        message.ack()

    streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
    print("Listening for messages on {}..\n".format(subscription_path))
    with subscriber:
        try:
            streaming_pull_future.result(timeout=timeout)
        except TimeoutError:
            streaming_pull_future.cancel()

if __name__ == "__main__":
    receive_messages("Project1", "my-sub")
But when I receive a message, I get just the string data:
Received message: Message {
  data: b'/tmp/doc_pages/document-page17.pdf'
  ordering_key: ''
  attributes: {}
}
My idea is to get that PDF file and perform an OCR operation on it using the Vision API. Is it possible to get the PDF file itself? If there is any other methodology, please let me know.
Thanks!
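Note that a Pub/Sub message only carries the bytes you publish, and in the code above that is the local file path, not the PDF content. A common pattern is to publish the GCS object path instead and have the subscriber callback fetch the PDF bytes from the bucket before calling the Vision API; a minimal sketch of such a callback, where BUCKET_NAME and the assumption that message.data holds the GCS object path are illustrative:
from google.cloud import storage

BUCKET_NAME = "your-bucket"  # assumed: the same bucket the pages were uploaded to

def callback(message):
    gcs_path = message.data.decode("utf-8")  # assumes the publisher sends the GCS object path
    blob = storage.Client().bucket(BUCKET_NAME).blob(gcs_path)
    pdf_bytes = blob.download_as_bytes()     # the actual PDF content, ready for OCR
    print("Fetched {} ({} bytes)".format(gcs_path, len(pdf_bytes)))
    message.ack()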

Mocking boto3 Cloudwatch Log client

A CloudWatch log is an object with Log Group > Log Stream > Log Events on AWS. I am trying to write tests for this, but the moto mocking raises a client error when applied to boto3.client('logs'). I am looking at other ways to mock the behavior of the log. How would you write a test for this function?
For example:
client = boto3.client('logs')

def get_recent_log_stream_name(logGroupName):
    response = client.describe_log_streams(
        logGroupName=logGroupName,
        orderBy='LastEventTime',
        descending=True,
        limit=1)
    logStreamName = response['logStreams'][0]['logStreamName']
    return logStreamName
I would write the test using moto like this:
import boto3
from moto import mock_logs

def get_recent_log_stream_name(logs, logGroupName):
    """Function under test"""
    response = logs.describe_log_streams(
        logGroupName=logGroupName,
        orderBy='LastEventTime',
        descending=True,
        limit=1)
    log_stream_name = response['logStreams'][0]['logStreamName']
    return log_stream_name

@mock_logs
def test_get_recent_log_stream_name():
    """Test function"""
    log_group_name = 'test-group'
    log_stream_name = 'test-stream'
    logs = boto3.client('logs')
    logs.create_log_group(logGroupName=log_group_name)
    logs.create_log_stream(
        logGroupName=log_group_name,
        logStreamName=log_stream_name,
    )
    assert get_recent_log_stream_name(logs, log_group_name) == log_stream_name

test_get_recent_log_stream_name()
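One version note: if the import of mock_logs fails on a newer moto release, moto 5.x replaced the per-service decorators with a single mock_aws decorator; only the import and decorator change, and the test body stays the same:
from moto import mock_aws  # moto >= 5.0: one decorator for all services

@mock_aws
def test_get_recent_log_stream_name():
    ...  # identical body to the test above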
