How to publish and subscribe to a .pdf file in Google Pub/Sub (GCP) - python-3.x

In the code below, a big .pdf file has been split into single pages; each page is uploaded to a bucket and enqueued to Pub/Sub at the same time.
import glob
import os

from google.cloud import pubsub_v1, storage

def publish_messages(project_id, topic_id, enqueue_file):
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_id)
    data = enqueue_file
    # Data must be a bytestring
    data = data.encode("utf-8")
    # When you publish a message, the client returns a future.
    future = publisher.publish(topic_path, data=data)
    print(future.result())
    print(enqueue_file + " has been enqueued to Pub/Sub.")
def upload_local_directory_to_gcs(local_path, bucket, gcs_path):
    assert os.path.isdir(local_path)
    for local_file in glob.glob(local_path + '/**'):
        if not os.path.isfile(local_file):
            continue
        remote_path = os.path.join(gcs_path, local_file[1 + len(local_path):])
        storage_client = storage.Client()
        buck = storage_client.bucket(bucket)
        blob = buck.blob(remote_path)
        blob.upload_from_filename(local_file)
        print("Uploaded " + local_file + " to gs bucket " + bucket)
        publish_messages("Project1", "my-topic", local_file)
I receive messages using the code below:
def receive_messages(project_id, subscription_id, timeout=None):
    from concurrent.futures import TimeoutError
    from google.cloud import pubsub_v1

    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(project_id, subscription_id)

    def callback(message):
        print("Received message: {}".format(message))
        message.ack()

    streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
    print("Listening for messages on {}..\n".format(subscription_path))

    with subscriber:
        try:
            streaming_pull_future.result(timeout=timeout)
        except TimeoutError:
            streaming_pull_future.cancel()

if __name__ == "__main__":
    receive_messages("Project1", "my-sub")
But when I receive a message, I get only string data:
Received message: Message {
  data: b'/tmp/doc_pages/document-page17.pdf'
  ordering_key: ''
  attributes: {}
}
My idea is to get that PDF file and run some OCR on it using the Vision API. Is it possible to get the PDF file itself? If there is another methodology, please let me know.
Thanks!
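Note: Pub/Sub only delivers the bytes you publish, so the message above contains exactly the path string that was published. A common pattern (a sketch, not part of the question's code; it assumes you publish the GCS object name rather than the local /tmp path, and that the bucket name below matches the one used for the upload) is to let the subscriber fetch the PDF from Cloud Storage inside the callback:

from google.cloud import storage

BUCKET = "my-bucket"  # assumption: the bucket passed to upload_local_directory_to_gcs

def callback(message):
    object_name = message.data.decode("utf-8")        # e.g. "doc_pages/document-page17.pdf"
    local_copy = "/tmp/" + object_name.split("/")[-1]
    # Pub/Sub only carried the name; pull the actual PDF bytes from Cloud Storage.
    storage.Client().bucket(BUCKET).blob(object_name).download_to_filename(local_copy)
    # local_copy (or the gs://<bucket>/<object> URI directly) can now be handed to the Vision API for OCR.
    message.ack()

Publishing the object name (or a gs:// URI) and downloading on the consumer side is generally preferable to embedding the PDF bytes in the message itself, since Pub/Sub messages are limited to 10 MB.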

Related

How to use aioboto3 & asyncio to download a file from AWS S3 - Python

I have a sync script which is running and working well, but I see that some file downloads take time, so I thought of using an async approach here.
import json
import os
import io
import time
import gzip
import re
import logging
from logging.handlers import RotatingFileHandler

import boto3

AWS_KEY = "**"
AWS_SECRET = "**"
QUEUE_URL = "***"
OUTPUT_PATH = "./test"
VISIBILITY_TIMEOUT = 10
REGION_NAME = "region"
sleep_time = 1  # polling pause in seconds (value assumed; not shown in the original snippet)

sqs = boto3.resource('sqs', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
s3 = boto3.client('s3', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
queue = sqs.Queue(url=QUEUE_URL)

def handle_response(msg, path):
    """Logic goes here"""
    print('message: %s' % msg)

def download_message_files(msg):
    for s3_file in msg['files']:
        s3_path = s3_file['path']
        with io.BytesIO() as f:
            s3.download_fileobj(msg['bucket'], s3_path, f)
            f.seek(0)
            for line in gzip.GzipFile(fileobj=f):
                handle_response(line.decode('UTF-8'), s3_path)

def consume():
    while True:
        for msg in queue.receive_messages(VisibilityTimeout=VISIBILITY_TIMEOUT):
            body = json.loads(msg.body)  # grab the actual message body
            download_message_files(body)
            msg.delete()
        time.sleep(sleep_time)
if __name__ == '__main__':
    # Setup our root logger
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s")
    # Create our FDR logger
    logger = logging.getLogger("Consumer")
    # Rotate log file handler
    RFH = RotatingFileHandler("test.log", maxBytes=20971520, backupCount=5)
    # Log file output format
    F_FORMAT = logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s')
    # Set the log file output level to INFO
    RFH.setLevel(logging.INFO)
    # Add our log file formatter to the log file handler
    RFH.setFormatter(F_FORMAT)
    # Add our log file handler to our logger
    logger.addHandler(RFH)
    consume()
I have tried converting this using aioboto3 and got stuck with the queue approach.
session = aioboto3.Session()
sqs = session.resource('sqs', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
s3 = session.client('s3', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET)
queue = sqs.Queue(url=QUEUE_URL)  # <---- this gives an error: 'ResourceCreatorContext' object has no attribute 'Queue'
As far as I understand from this, there is no such attribute, but could anyone guide me to make this work in an async manner?
You can use asyncio and aioboto3 together.
Instead of creating a resource, you can use a client. The difference between an aioboto3 client and an aioboto3 resource can be found in this answer.
This is a simple working example:
import aioboto3

async def consume():
    async with aioboto3.Session().client(service_name='sqs', region_name=REGION_NAME, aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET) as client:
        # The async SQS client exposes receive_message (not the resource-style receive_messages).
        response = await client.receive_message(QueueUrl=QUEUE_URL, VisibilityTimeout=VISIBILITY_TIMEOUT)
        for message in response.get('Messages', []):
            print(message)  # Do something with each message
This should solve the error you are facing. This solution can also be extended to S3 as per your requirements.
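For the S3 part, a similar sketch (not from the original answer; it reuses the question's constants and mirrors the synchronous download_message_files above) could look like this:

import gzip
import io

import aioboto3

async def download_message_files(msg):
    session = aioboto3.Session()
    async with session.client('s3', region_name=REGION_NAME,
                              aws_access_key_id=AWS_KEY,
                              aws_secret_access_key=AWS_SECRET) as s3:
        for s3_file in msg['files']:
            s3_path = s3_file['path']
            with io.BytesIO() as f:
                # Non-blocking download into an in-memory buffer.
                await s3.download_fileobj(msg['bucket'], s3_path, f)
                f.seek(0)
                for line in gzip.GzipFile(fileobj=f):
                    handle_response(line.decode('UTF-8'), s3_path)

Several files can then be downloaded concurrently with asyncio.gather if needed.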

EventHub and Receive

All,
I modified the sample Receive Python script for Azure Event Hub a bit, but when I run it, it goes into a loop fetching the same events over and over. I'm not sending any events to the Event Hub, since I only want to read what is already there, and I don't see a while loop here, so how is this happening, and how do I stop after it has read all the events currently in the Event Hub?
Thanks,
grajee
# https://learn.microsoft.com/en-us/python/api/overview/azure/eventhub-readme?view=azure-python#consume-events-from-an-event-hub
import logging

from azure.eventhub import EventHubConsumerClient

connection_str = 'Endpoint=sb://testhubns01.servicebus.windows.net/;SharedAccessKeyName=getevents;SharedAccessKey=testtestest='
consumer_group = '$Default'
eventhub_name = 'testpart'
client = EventHubConsumerClient.from_connection_string(connection_str, consumer_group, eventhub_name=eventhub_name)

logger = logging.getLogger("azure.eventhub")
logging.basicConfig(level=logging.INFO)

def on_event(partition_context, event):
    logger.info("Received event from partition: \"{}\" : \"{}\"".format(partition_context.partition_id, event.body_as_str()))
    partition_context.update_checkpoint(event)

with client:
    client.receive(
        on_event=on_event,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )
    # receive events from specified partition:
    # client.receive(on_event=on_event, partition_id='0')

client.close()
Without a checkpoint store, starting_position="-1" makes the client read the partition from the beginning on every run, so the same events come back over and over. The piece of code below, from here, makes this clearer: it persists checkpoints to blob storage, so events that have already been processed are not delivered again the next time the client starts.
import asyncio

from azure.eventhub.aio import EventHubConsumerClient
from azure.eventhub.extensions.checkpointstoreblobaio import BlobCheckpointStore

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'
storage_connection_str = '<< CONNECTION STRING FOR THE STORAGE >>'
container_name = '<< NAME OF THE BLOB CONTAINER >>'

async def on_event(partition_context, event):
    # do something
    await partition_context.update_checkpoint(event)  # Or update_checkpoint every N events for better performance.

async def receive(client):
    await client.receive(
        on_event=on_event,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )

async def main():
    checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)
    client = EventHubConsumerClient.from_connection_string(
        connection_str,
        consumer_group,
        eventhub_name=eventhub_name,
        checkpoint_store=checkpoint_store,  # For load balancing and checkpoint. Leave None for no load balancing.
    )
    async with client:
        await receive(client)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
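On the second part of the question, how to stop: client.receive() blocks and keeps listening indefinitely, so the script never ends on its own. One option (a sketch against the synchronous client from the question; the 30-second window is an assumption to tune) is to run receive() on a worker thread and close the client when you are done:

import threading
import time

# client and on_event as defined in the question above.
worker = threading.Thread(
    target=client.receive,
    kwargs={"on_event": on_event, "starting_position": "-1"},
)
worker.start()
time.sleep(30)   # assumed drain window; tune to your backlog size
client.close()   # makes the blocking receive() return
worker.join()

Recent versions of azure-eventhub also accept a max_wait_time argument to receive(); when no event arrives within that window, on_event is called with event set to None, which can serve as a drain signal.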

Uploading images to a Flask API using FlaskClient

I have the following endpoint in my Flask API:
@api.route("/files", methods=Method.POST)
def upload_file() -> str:
    enquirer_id = request.get_json()['id']
    try:
        logging.info(request.files)
        uploaded_file = request.files['image']
    except KeyError:
        abort(HttpStatusCode.BAD_REQUEST, "Please provide an image to upload")
    if uploaded_file.content_type != ContentType.JPEG:
        abort(HttpStatusCode.BAD_REQUEST, "Only JPEG images are allowed")
    filename = str(enquirer_id)
    destination_file = os.path.join(current_app.config[AppCfg.DIRECTORY], filename)
    uploaded_file.save(destination_file)
    return f'Successfully uploaded file {filename}.'
But I am unable to test this endpoint using the FlaskClient:
class APITestUploadDownload(APITestBase):
    url = '/api/v1/files'
    my_file = 'some_picture.jpg'

    def setUp(self):
        self.client = self.app.test_client()

    def test_upload(self):
        with open(file=self.my_file, mode='rb') as file:
            data = {
                'image': io.BytesIO(file.read()),
                'id': 1
            }
        response = self.client.post(self.url, data=json.dumps(data), headers={}, content_type='multipart/form-data')
        self.assertEqual(response.status_code, HttpStatusCode.OK)
But this fails with the error message TypeError: Object of type BytesIO is not JSON serializable.
If I make the request without json.dumps(data), using response = self.client.post(self.url, data=data, headers={}, content_type='multipart/form-data'), I get another error which tells me that response.data is None.
So how can I fix my file upload endpoint and the corresponding test?
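For reference, the usual pattern with the Werkzeug test client (a sketch, not a verified fix for this exact codebase) is to send the file as a multipart field instead of JSON: pass a (stream, filename) tuple in data, drop json.dumps entirely, and read the id from request.form in the endpoint, since a multipart request has no JSON body:

# Test side: let the Werkzeug test client build the multipart body itself.
def test_upload(self):
    with open(self.my_file, mode='rb') as file:
        data = {
            'image': (io.BytesIO(file.read()), self.my_file),  # (stream, filename)
            'id': '1',                                          # plain form field
        }
    response = self.client.post(self.url, data=data, content_type='multipart/form-data')
    self.assertEqual(response.status_code, HttpStatusCode.OK)

# Endpoint side: multipart requests carry form fields, not JSON.
enquirer_id = request.form['id']        # instead of request.get_json()['id']
uploaded_file = request.files['image']

With that, request.files['image'] receives a proper FileStorage object carrying both the stream and a filename.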

OSError: [Errno 30] Read-only file system: Any way to create a CSV and attach it to an email from Python in AWS Lambda?

So I am trying to create a Lambda function that queries a MySQL database, writes the query output to a CSV file, attaches that file to an email and sends it, all using Python.
I have working code that achieves this, but I am unable to execute the same in AWS Lambda.
Here's the code I'm working on right now:
import smtplib
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import lxml.html
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://username:password@MYSQL DB Creds")
con_mysql = engine.connect()

# dump_attachment_query_link
attach = pd.read_sql("SELECT * FROM some_table", con_mysql)

# NAMING_FILES
# start, yest are dates
def filenames(start, yest):
    if start == yest:
        return "Dump_{}.csv".format(yest)
    else:
        return "Dump_{}_to_{}.csv".format(start, yest)

attach.reset_index(drop=True, inplace=True)
att = attach.to_csv(filenames(start, yest))
files = filenames(start, yest)

def send_mail(fromaddr, subject, message):
    access_token, expires_in = refresh_authorization(GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET, GOOGLE_REFRESH_TOKEN)
    auth_string = generate_oauth2_string(fromaddr, access_token, as_base64=True)
    msg = MIMEMultipart('related')
    msg['Subject'] = subject + ": %s" % yest
    msg['From'] = fromaddr
    msg['To'] = "receivers'mail"
    msg.preamble = 'This is a multi-part message in MIME format.'
    msg_alternative = MIMEMultipart('alternative')
    msg.attach(msg_alternative)
    part_text = MIMEText(lxml.html.fromstring(message).text_content().encode('utf-8'), 'plain', _charset='utf-8')
    part_html = MIMEText(message.encode('utf-8'), 'html', _charset='utf-8')
    msg_alternative.attach(part_text)
    msg_alternative.attach(part_html)
    part = MIMEBase('application', "octet-stream")
    part.set_payload(open(files, "rb").read())
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment; filename={}'.format(files))
    msg.attach(part)
    server = smtplib.SMTP('smtp.gmail.com:587')
    server.ehlo(GOOGLE_CLIENT_ID)
    server.starttls()
    server.docmd('AUTH', 'XOAUTH2 ' + auth_string)
    server.sendmail(fromaddr, msg['To'].split(",") + msg['Cc'].split(","), msg.as_string())
    server.quit()
When I run the code I get the following error: [ERROR] OSError: [Errno 30] Read-only file system: 'Dump_{}_to_{}.csv'
I'm quite new to Lambda and Python.
Any help regarding this is appreciated.
The error says you cannot write to the file system, and according to the code you try to create CSV files there.
You don't need to create real files to build an attachment: in-memory streams are enough.
Alternatively, your Lambda is allowed to write to the /tmp directory.
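Either variant can be slotted into the code above; here is a sketch that reuses the question's attach DataFrame, filenames() helper, and MIME objects (everything else is an assumption):

import io

# Option 1: write the CSV to Lambda's writable scratch space.
files = "/tmp/" + filenames(start, yest)
attach.to_csv(files, index=False)

# Option 2: skip the file system entirely and attach from an in-memory buffer.
csv_buffer = io.StringIO()
attach.to_csv(csv_buffer, index=False)
part = MIMEBase('application', "octet-stream")
part.set_payload(csv_buffer.getvalue().encode('utf-8'))
encoders.encode_base64(part)
part.add_header('Content-Disposition',
                'attachment; filename={}'.format(filenames(start, yest)))
msg.attach(part)

/tmp gives you 512 MB of scratch space by default and is not shared between invocations, so for modest query results the in-memory variant is usually the simpler choice.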

Easy integration of chatbot with slack-app

I have a ChatBot application running and just want to hook this application up to the Slack API as its interface.
I used Slack RTM and maintained the user session with its Slack user ID.
I finally solved it and wrote a client (API) which can easily connect to any conversation engine.
GitHub repo link: https://github.com/csemanmohan/Slack_api_client
import time
import re

import requests
from slackclient import SlackClient

# 'url' is the chatbot endpoint and 'slack_token' is the Slack application user access token
url = "http://127.0.0.1:****/*******/v2/api"
slack_token = "xoxb-**********-***********-*************lipO8hoI"

# instantiate Slack client
slack_client = SlackClient(slack_token)
# starterbot's user ID in Slack: value is assigned after the bot starts up
starterbot_id = None

# constants
RTM_READ_DELAY = 1  # 1 second delay between reading from RTM
EXAMPLE_COMMAND = "do"
MENTION_REGEX = "^<@(|[WU].+?)>(.*)"

def parse_bot_commands(slack_events):
    """
    Parses a list of events coming from the Slack RTM API to find bot commands.
    If a bot command is found, this function returns a tuple of command and channel.
    If it is not found, then this function returns None, None.
    """
    # the vars msg and channel_def below are used
    # when no trigger (@app-name) is passed from the application
    msg = ""
    channel_def = ""
    for event in slack_events:
        if event["type"] == "message" and not "subtype" in event:
            msg = event["text"]
            channel_def = event["channel"]
            user_id, message = parse_direct_mention(event["text"])
            print("there is an event here...", user_id, message)
            if user_id == starterbot_id:
                return message, event["channel"]
            channel_def = channel_def
    return msg, channel_def

def parse_direct_mention(message_text):
    """
    Finds a direct mention (a mention that is at the beginning) in message text
    and returns the user ID which was mentioned. If there is no direct mention, returns None.
    """
    matches = re.search(MENTION_REGEX, message_text)
    # the first group contains the username, the second group contains the remaining message
    return (matches.group(1), matches.group(2).strip()) if matches else (None, None)

def handle_command(command, channel):
    """
    Executes bot command if the command is known
    """
    # Default response is help text for the user
    default_response = "Not sure what you mean. Try *{}*.".format(EXAMPLE_COMMAND)

    # Implemented the code snippet below for making an API call to the ChatBot
    input_text = command
    payload = {"text": input_text, "email": "manmohan@m******.com"}
    headers = {'content-type': "application/json"}
    resp = requests.request("POST", url, json=payload, headers=headers)
    result = eval(resp.json())
    print("result is: ", result)
    response = result['text']

    # Sends the response back to the channel
    slack_client.api_call(
        "chat.postMessage",
        channel=channel,
        text=response or default_response
    )

if __name__ == "__main__":
    if slack_client.rtm_connect(with_team_state=False):
        print("Starter Bot connected and running!")
        # Read bot's user ID by calling Web API method `auth.test`
        starterbot_id = slack_client.api_call("auth.test")["user_id"]
        while True:
            command, channel = parse_bot_commands(slack_client.rtm_read())
            if command:
                handle_command(command, channel)
            time.sleep(RTM_READ_DELAY)
    else:
        print("Connection failed. Exception traceback printed above.")
