import json
import base64
from google.cloud import bigquery
import ast
import pandas as pd
import sys
import pandas_gbq
def process_data(data):
    # msg = str(data)
    df = pd.DataFrame({"Data": data}, index=[0])
    df['time'] = pd.datetime.now()
    lst = list(df)
    df[lst] = df[lst].astype(str)
    pandas_gbq.to_gbq(df, 'datasetid.tableid', project_id='project_id', if_exists='append')
def receive_messages(project_id, subscription_name):
    """Receives messages from a pull subscription."""
    # [START pubsub_subscriber_async_pull]
    # [START pubsub_quickstart_subscriber]
    import time
    from google.cloud import pubsub_v1

    # TODO project_id = "Your Google Cloud Project ID"
    # TODO subscription_name = "Your Pub/Sub subscription name"
    subscriber = pubsub_v1.SubscriberClient()
    # The `subscription_path` method creates a fully qualified identifier
    # in the form `projects/{project_id}/subscriptions/{subscription_name}`
    subscription_path = subscriber.subscription_path(
        project_id, subscription_name)

    def callback(message):
        # print('Received message: {}'.format(message))
        process_data(message)
        message.ack()

    subscriber.subscribe(subscription_path, callback=callback)

    # The subscriber is non-blocking. We must keep the main thread from
    # exiting to allow it to process messages asynchronously in the background.
    # print('Listening for messages on {}'.format(subscription_path))
    while True:
        time.sleep(60)
    # [END pubsub_subscriber_async_pull]
    # [END pubsub_quickstart_subscriber]
receive_messages('project-id', 'sub-id')
I'm streaming real-time data from Pub/Sub to BigQuery using Cloud Functions.
I get the following error:
Deployment failure:
Function failed on loading user code. Error message: Error: function load attempt timed out.
Your code is stuck in a while True loop, so it never returns. Cloud Functions therefore considers that your code has crashed, and your function is killed.
Redesign your code so that Pub/Sub calls your Cloud Function through events (triggers). Follow this guide on how to implement a correct design:
Google Cloud Pub/Sub Triggers
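For illustration, here is a minimal sketch of what the event-driven version could look like (assuming the Python background-function signature for Pub/Sub triggers; the dataset, table, and project IDs are placeholders carried over from the question):

import base64
import pandas as pd
import pandas_gbq

def pubsub_to_bigquery(event, context):
    # Triggered directly by a message published to the Pub/Sub topic;
    # event['data'] carries the base64-encoded message payload.
    payload = base64.b64decode(event['data']).decode('utf-8') if 'data' in event else ''
    df = pd.DataFrame({'Data': payload, 'time': str(pd.Timestamp.now())}, index=[0])
    df = df.astype(str)
    pandas_gbq.to_gbq(df, 'datasetid.tableid', project_id='project_id', if_exists='append')
    # No while True loop: the function returns and the invocation completes.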
I've created an Azure Function that fetches data from an external API and sends that data to my Cosmos container using a timer trigger. The function is not sending any data to Cosmos DB and I can't figure out why.
I've removed some lines of code which contained sensitive information, but I have tested the code locally and it fetches the data and sends it to Cosmos if I just run the file.
here is my function:
import datetime
import logging
import azure.functions as func
import time
import pandas as pd
import requests
import json
import uuid
from azure.cosmos.aio import CosmosClient
import asyncio
def get_data():
    url = 'externalAPIUrl'
    session = requests.Session()
    request = session.get(url, headers=headers)
    cookies = dict(request.cookies)
    response = session.get(url, headers=headers, cookies=cookies).json()
    rawData = pd.DataFrame(response)
    rawop = pd.DataFrame(rawData['filtered']['data'])
    rawop = rawop.set_index('paramter')
    processed_data = []
    for i in rawop.index:
        # creating json to send
        processed_data.append(processed_ce)
        processed_data.append(processed_pe)
    logging.info("Data processed successfully")
    return processed_data
async def manage_cosmos(processed_data):
    print(len(processed_data))
    cosmosdb_endpoint = 'endpointURL'
    cosmos_key = 'key'
    DATABASE_NAME = 'dbname'
    CONTAINER_NAME = 'containername'
    async with CosmosClient(cosmosdb_endpoint, cosmos_key) as client:
        database = client.get_database_client(DATABASE_NAME)
        container = database.get_container_client(CONTAINER_NAME)
        # send all the objects in the processed_data list to Cosmos asynchronously
        tasks = []
        for i in processed_data[:40]:
            tasks.append(container.create_item(i))
        await asyncio.gather(*tasks)
        tasks = []
        for i in processed_data[40:90]:
            tasks.append(container.create_item(i))
        await asyncio.gather(*tasks)
        tasks = []
        for i in processed_data[90:]:
            tasks.append(container.create_item(i))
        await asyncio.gather(*tasks)
    logging.info("Data sent to cosmos successfully")
def main(mytimer: func.TimerRequest) -> None:
    if mytimer.past_due:
        logging.info('The timer is past due!')
    try:
        processed_data = get_data()
        asyncio.run(manage_cosmos(processed_data))
    except Exception as e:
        logging.error("error in function", e)
here is my function.json
{
  "scriptFile": "__init__.py",
  "bindings": [
    {
      "name": "mytimer",
      "type": "timerTrigger",
      "direction": "in",
      "schedule": "0 */2 * * * *"
    }
  ]
}
this is my requirements.txt
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues
azure-functions
pandas
asyncio
aiohttp
requests
uuid
azure-cosmos
Can somebody tell me why it is not sending data to cosmos?
I deployed the above function and no data was added to my cosmos container. I ran the same code without the azure-function part locally and I was able to send new items to my cosmos container.
I am most probably making a mistake in creating the function, so it would help if anybody could point that out.
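One thing worth ruling out (an observation, not a confirmed fix for the missing Cosmos writes): logging.error("error in function", e) passes the exception as a %-format argument with no placeholder in the message, so the logging call itself fails and the real traceback never reaches the function's log stream. A minimal sketch of the handler using logging.exception instead, so the actual error surfaces in the logs:

def main(mytimer: func.TimerRequest) -> None:
    if mytimer.past_due:
        logging.info('The timer is past due!')
    try:
        processed_data = get_data()
        asyncio.run(manage_cosmos(processed_data))
    except Exception:
        # logging.exception records the message plus the full traceback,
        # which will show up in the Function's logs / Application Insights
        logging.exception("error in function")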
I am trying to run Spark NLP as an Azure Function.
I have a function app which runs in a Docker container. My function app code runs on Python, and I also install Java because I run PySpark within it. I use Python's Flask within one function to handle incoming requests.
Once the function app starts and the container is running, I get responses to my API calls for the first few seconds, but after only a few seconds (~15-20 seconds) the API calls start timing out because the server stops responding.
The function app is running on a dedicated App Service plan and is set to 'always on'.
What is the reason for this behavior?
Here is my function app code:
import logging
import azure.functions as func
# Imports for Spark-NLP
import os
import sys
sys.path.append('/home/site/wwwroot/contextSpellCheck/spark-2.4.7-bin-hadoop2.7/python')
sys.path.append('/home/site/wwwroot/contextSpellCheck/spark-2.4.7-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip')
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.annotator import *
from flask import Flask, request
app = Flask(__name__)
spark = sparknlp.start()
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = RecursiveTokenizer().setInputCols(["document"]).setOutputCol("token").setPrefixes(["\"", "(", "[", "\n"]).setSuffixes([".", ",", "?", ")", "!", "'s"])
spellModel = ContextSpellCheckerModel.load("/home/site/wwwroot/contextSpellCheck/spellcheck_dl_en_2.5.0_2.4_1588756259065").setInputCols("token").setOutputCol("checked")
finisher = Finisher().setInputCols("checked")
pipeline = Pipeline(stages=[documentAssembler, tokenizer, spellModel, finisher])
empty_ds = spark.createDataFrame([[""]]).toDF("text")
lp = LightPipeline(pipeline.fit(empty_ds))
@app.route('/api/testFunction', methods=['GET', 'POST'])
def annotate():
    global lp
    if request.method == 'GET':
        text = request.args.get('text')
    elif request.method == 'POST':
        req_body = request.get_json()
        text = req_body['text']
    return lp.annotate(text)
def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')
    return func.WsgiMiddleware(app).handle(req, context)
It may be that you are creating a pipeline per request. You have a stack with several layers and languages, so it could be that one of them is rebuilding the pipeline on each call.
See the section on "Avoid creating lots of pipelines" in https://stanfordnlp.github.io/CoreNLP/memory-time.html#avoid-creating-lots-of-pipelines
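As a rough illustration of that advice (a generic sketch, not Spark NLP-specific API guidance): build the expensive pipeline lazily, exactly once, and keep per-request work down to the cheap annotate call. The names below mirror the question's code.

_lp = None  # cached LightPipeline, built on first use

def get_light_pipeline():
    global _lp
    if _lp is None:
        # one-time heavy setup: start Spark, load the spell-check model, fit the pipeline
        empty_ds = spark.createDataFrame([[""]]).toDF("text")
        _lp = LightPipeline(pipeline.fit(empty_ds))
    return _lp

@app.route('/api/testFunction', methods=['GET', 'POST'])
def annotate():
    text = request.args.get('text') if request.method == 'GET' else request.get_json()['text']
    # per-request work is only the lightweight annotate call
    return get_light_pipeline().annotate(text)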
I have Pub/Sub subscribe logic wrapped inside a subscribe method that is being called once during service initialization for every subscription:
def subscribe(self,
              callback: typing.Callable,
              subscription_name: str,
              topic_name: str,
              project_name: str = None) -> typing.Optional[SubscriberClient]:
    """Subscribes to Pub/Sub topic and return subscriber client

    :param callback: subscription callback method
    :param subscription_name: name of the subscription
    :param topic_name: name of the topic
    :param project_name: optional project name. Uses default project if not set
    :return: subscriber client or None if testing
    """
    project = project_name if project_name else self.pubsub_project_id
    self.logger.info('Subscribing to project `{}`, topic `{}`'.format(project, topic_name))

    project_path = self.pubsub_subscriber.project_path(project)
    topic_path = self.pubsub_subscriber.topic_path(project, topic_name)
    subscription_path = self.pubsub_subscriber.subscription_path(project, subscription_name)

    # check if there is an existing subscription, if not, create it
    if subscription_path not in [s.name for s in self.pubsub_subscriber.list_subscriptions(project_path)]:
        self.logger.info('Creating new subscription `{}`, topic `{}`'.format(subscription_name, topic_name))
        self.pubsub_subscriber.create_subscription(subscription_path, topic_path)

    # subscribe to the topic
    self.pubsub_subscriber.subscribe(
        subscription_path, callback=callback,
        scheduler=self.thread_scheduler
    )
    return self.pubsub_subscriber
This method is called like this:
self.subscribe_client = self.subscribe(
    callback=self.pubsub_callback,
    subscription_name='subscription_topic',
    topic_name='topic'
)
The callback method does a bunch of stuff, sends 2 emails, and then acknowledges the message:
def pubsub_callback(self, data: gcloud_pubsub_subscriber.Message):
    self.logger.debug('Processing pub sub message')
    try:
        self.do_something_with_message(data)
        self.logger.debug('Acknowledging the message')
        data.ack()
        self.logger.debug('Acknowledged')
        return
    except:
        self.logger.warning({
            "message": "Failed to process Pub/Sub message",
            "request_size": data.size,
            "data": data.data
        }, exc_info=True)
        self.logger.debug('Acknowledging the message 2')
        data.ack()
When I push something to the subscription, the callback runs and prints all the debug messages, including Acknowledged. The message, however, stays in Pub/Sub, the callback gets called again, and the retry interval grows exponentially. The question is: what could cause the message to stay in Pub/Sub even after ack is called?
I have several such subscriptions, and all of them work as expected. The ack deadline is not the issue: the callback finishes almost immediately, and I played with the ack deadline anyway; nothing helped.
When I process these messages from a locally running app connected to the same Pub/Sub subscription, it completes just fine and the acknowledgement takes the message out of the queue as expected.
So the problem manifests only in the deployed service (running inside a Kubernetes pod):
The callback executes, but ack does seemingly nothing.
Acking messages from a script running locally (...and doing the exact same stuff) or through the GCP UI works as expected.
Any ideas?
Acknowledgements are best-effort in Pub/Sub, so it's possible but unusual for messages to be redelivered.
If you are consistently receiving duplicates, it might be due to duplicate publishes of the same message contents. As far as Pub/Sub is concerned, these are different messages and will be assigned different message IDs. Check the Pub/Sub-provided message IDs to ensure that you are actually receiving the same message multiple times.
There is an edge case in dealing with large backlogs of small messages with streaming pull (which is what the Python client library uses). If you are running multiple clients subscribing on the same subscription, this edge case may be relevant.
You can also check your subscription's Stackdriver metrics to see:
if its acks are being sent successfully (subscription/ack_message_count)
if its backlog is decreasing (subscription/backlog_bytes)
if your subscriber is missing the ack deadline (subscription/streaming_pull_ack_message_operation_count filtered by response_code != "success")
If you're not missing the ack deadline and your backlog is remaining steady, you should contact Google Cloud support with your project name, subscription name, and a sample of the duplicate message IDs. They will be able to investigate why these duplicates are happening.
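To apply the message-ID check above, here is a minimal diagnostic sketch shaped like the question's callback (for logging only, not a fix): it records the Pub/Sub-assigned ID and publish time on every delivery. Repeated IDs indicate true redeliveries; distinct IDs indicate the same payload being published more than once.

def pubsub_callback(self, data):
    # message_id is assigned by Pub/Sub at publish time, so it distinguishes
    # a redelivered message from a duplicate publish of the same payload
    self.logger.debug('message_id=%s publish_time=%s', data.message_id, data.publish_time)
    self.do_something_with_message(data)
    data.ack()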
I did some additional testing and I finally found the problem.
TL;DR: I was using the same google.cloud.pubsub_v1.subscriber.scheduler.ThreadScheduler for all subscriptions.
Here are the snippets of the code I used to test it. This is the broken version:
server.py
import concurrent.futures.thread
import os
import time
from google.api_core.exceptions import AlreadyExists
from google.cloud import pubsub_v1
from google.cloud.pubsub_v1.subscriber.scheduler import ThreadScheduler
def create_subscription(project_id, topic_name, subscription_name):
    """Create a new pull subscription on the given topic."""
    subscriber = pubsub_v1.SubscriberClient()
    topic_path = subscriber.topic_path(project_id, topic_name)
    subscription_path = subscriber.subscription_path(
        project_id, subscription_name)

    subscription = subscriber.create_subscription(
        subscription_path, topic_path)
    print('Subscription created: {}'.format(subscription))


def receive_messages(project_id, subscription_name, t_scheduler):
    """Receives messages from a pull subscription."""
    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project_id, subscription_name)

    def callback(message):
        print('Received message: {}'.format(message.data))
        message.ack()

    subscriber.subscribe(subscription_path, callback=callback, scheduler=t_scheduler)
    print('Listening for messages on {}'.format(subscription_path))


project_id = os.getenv("PUBSUB_PROJECT_ID")
publisher = pubsub_v1.PublisherClient()
project_path = publisher.project_path(project_id)

# Create both topics
try:
    topics = [topic.name.split('/')[-1] for topic in publisher.list_topics(project_path)]
    if 'topic_a' not in topics:
        publisher.create_topic(publisher.topic_path(project_id, 'topic_a'))
    if 'topic_b' not in topics:
        publisher.create_topic(publisher.topic_path(project_id, 'topic_b'))
except AlreadyExists:
    print('Topics already exists')

# Create subscriptions on both topics
sub_client = pubsub_v1.SubscriberClient()
project_path = sub_client.project_path(project_id)

try:
    subs = [sub.name.split('/')[-1] for sub in sub_client.list_subscriptions(project_path)]
    if 'topic_a_sub' not in subs:
        create_subscription(project_id, 'topic_a', 'topic_a_sub')
    if 'topic_b_sub' not in subs:
        create_subscription(project_id, 'topic_b', 'topic_b_sub')
except AlreadyExists:
    print('Subscriptions already exists')

scheduler = ThreadScheduler(concurrent.futures.thread.ThreadPoolExecutor(10))
receive_messages(project_id, 'topic_a_sub', scheduler)
receive_messages(project_id, 'topic_b_sub', scheduler)

while True:
    time.sleep(60)
client.py
import datetime
import os
import random
import sys
from time import sleep
from google.cloud import pubsub_v1
def publish_messages(pid, topic_name):
    """Publishes multiple messages to a Pub/Sub topic."""
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(pid, topic_name)

    for n in range(1, 10):
        data = '[{} - {}] Message number {}'.format(datetime.datetime.now().isoformat(), topic_name, n)
        data = data.encode('utf-8')
        publisher.publish(topic_path, data=data)
        sleep(random.randint(10, 50) / 10.0)


project_id = os.getenv("PUBSUB_PROJECT_ID")
publish_messages(project_id, sys.argv[1])
I connected to the cloud Pub/Sub; the server created the topics and subscriptions. Then I ran the client script multiple times in parallel for both topics. After a short while, once I changed the server code to instantiate a new thread scheduler inside the receive_messages scope, the server cleaned up both topics and functioned as expected.
The confusing thing is that in either case the server printed out the received message for all the messages.
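For completeness, this is the shape of the fix described above: a sketch of receive_messages creating its own scheduler instead of sharing a single instance across subscriptions.

def receive_messages(project_id, subscription_name):
    """Receives messages from a pull subscription, with a dedicated scheduler."""
    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project_id, subscription_name)

    def callback(message):
        print('Received message: {}'.format(message.data))
        message.ack()

    # Each subscription gets its own ThreadScheduler (and thread pool);
    # sharing one ThreadScheduler across subscriptions is what prevented
    # acks from taking effect in the broken version above.
    scheduler = ThreadScheduler(concurrent.futures.thread.ThreadPoolExecutor(10))
    subscriber.subscribe(subscription_path, callback=callback, scheduler=scheduler)
    print('Listening for messages on {}'.format(subscription_path))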
I am going to post this to https://github.com/googleapis/google-cloud-python/issues
When I try to run my program I get "AttributeError: 'BackgroundScheduler' object has no attribute 'add_cron_job'", when the program should run correctly.
Looking at the docs, they seem out of date. It's being run on Linux.
import requests
import datetime
import time
from apscheduler.schedulers.background import BackgroundScheduler

# Provide the webhook URL that Discord generated
discord_webhook_url = 'super secrect URL'

Scheduler = BackgroundScheduler()
Scheduler.daemonic = False
Scheduler.start()


def job_function():
    # Get the BTC price from CoinDesk
    bitcoin_price_url = 'https://api.coindesk.com/v1/bpi/currentprice/BTC.json'
    data = requests.get(bitcoin_price_url).json()
    price_in_usd = data['bpi']['USD']['rate']
    # Post the message to the Discord webhook
    data = {
        "content": "<#My Client ID> Bitcoin price is currently at $" + price_in_usd + " USD"
    }
    requests.post(discord_webhook_url, data=data)


Scheduler.add_cron_job(job_function, minute='0-59')
# the line above is where the error happens
It should just run and complete the function of posting a message with the BTC price to Discord.
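For reference (assuming APScheduler 3.x, where the 2.x helpers such as add_cron_job were removed in favor of add_job), the scheduling call would look roughly like this:

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler(daemon=False)
scheduler.start()

# add_job with a 'cron' trigger replaces the old add_cron_job helper;
# minute='0-59' fires job_function every minute
scheduler.add_job(job_function, 'cron', minute='0-59')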
I have a Node.js application that runs a client interface which exposes actions that trigger machine-learning tasks. Since Python is a better choice for implementing machine learning, I've implemented a Python application that runs those tasks on demand.
Now I need to integrate both applications, and it has been decided that we need to use a single (AWS) instance for both.
One way I found to do this integration is the python-shell Node module, where communication between Python and Node is done over stdin and stdout.
On the Node side I have something like this:
'use strict';
const express = require('express');
const PythonShell = require('python-shell');
var app = express();
app.listen(8000, function () {
    console.log('Example app listening on port 8000!');
});

var options = {
    mode: 'text',
    pythonPath: '../pythonapplication/env/Scripts/python.exe',
    scriptPath: '../pythonapplication/',
    pythonOptions: ['-u'], // Unbuffered
};

var pyshell = new PythonShell('start.py', options);

pyshell.on('message', function (message) {
    console.log(message);
});

app.get('/task', function (req, res) {
    pyshell.send('extract-job');
});

app.get('/terminate', function (req, res) {
    pyshell.send('terminate');
    pyshell.end(function (err, code, signal) {
        console.log(err);
        console.log(code);
        console.log(signal);
    });
});
On Python, I have a main script which loads some stuff and then calls a server script that runs forever, reading lines with sys.stdin.readline() and executing the corresponding task.
start.py is:
if __name__ == '__main__':
    # data = json.loads(sys.argv[1])
    from multiprocessing import Manager, Pool
    import logging
    import provider, server

    # Get logging setup objects
    debug_queue, debug_listener = provider.shared_logging(logging.DEBUG, 'python-server-debug.log')
    info_queue, info_listener = provider.shared_logging(logging.INFO, 'python-server.log')
    logger = logging.getLogger(__name__)

    # Start logger listeners
    debug_listener.start()
    info_listener.start()

    logger.info('Initializing pool of workers...')
    pool = Pool(initializer=provider.worker, initargs=[info_queue, debug_queue])

    logger.info('Initializing server...')
    try:
        server.run(pool)
    except (SystemError, KeyboardInterrupt) as e:
        logger.info('Execution terminated without errors.')
    except Exception as e:
        logger.error('Error on main process:', exc_info=True)
    finally:
        pool.close()
        pool.join()
        debug_listener.stop()
        info_listener.stop()

    print('Done.')
Both info_queue and debug_queue are multiprocessing.Queue instances used to handle multiprocessing logging. If I run my Python application standalone, everything works fine, even when using the pool of workers (logs get properly logged, prints get properly printed...).
But if I try to run it using python-shell, only my main process's prints and logs get printed and logged correctly... Every message (print or log) from my pool of workers gets held until I terminate the Python script.
In other words, every message gets held until the finally step runs after server.py's run() ends...
Does anyone have any insight into this issue? Have you heard about the python-bridge module? Is it a better solution? Can you suggest a better approach for this integration that does not use two separate servers?
Here I post my real provider script and a quick mock I made of the server script (the real one has too much stuff).
mock server.py:
import json
import logging
import multiprocessing
import sys
import time
from json.decoder import JSONDecodeError
from threading import Thread
def task(some_args):
    logger = logging.getLogger(__name__)
    results = 'results of machine learn task goes here, as a string'
    logger.info('log whatever im doing')
    # Some machine-learn task...
    logger.info('Returning results.')
    return results


def answer_node(message):
    print(message)
    # sys.stdout.write(message)
    # sys.stdout.flush()


def run(pool, recrutai, job_pool, candidate_queue):
    logger = logging.getLogger(__name__)
    workers = []
    logger.info('Server is ready and waiting for commands')
    while True:
        # Read input stream
        command = sys.stdin.readline()
        command = command.split('\n')[0]

        logger.debug('Received command: %s', command)
        if command == 'extract-job':
            logger.info(
                'Creating task.',
            )
            # TODO: Check data attributes
            p = pool.apply_async(
                func=task,
                args=('args'),
                callback=answer_node
            )
            # What to do with workers array?!
            workers.append(p)

        elif command == 'other-commands':
            pass
            # Other task here

        elif command == 'terminate':
            raise SystemError

        else:
            logger.warn(
                'Received an invalid command %s.',
                command
            )
my provider.py:
import logging
import os
from logging.handlers import QueueHandler, QueueListener
from multiprocessing import Queue
def shared_logging(level, file_name):
    # Create main logging file handler
    handler = logging.FileHandler(file_name)
    handler.setLevel(level)

    # Create logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)

    # Create queue shared between all processes to centralize logging features
    logger_queue = Queue()  # multiprocessing.Queue

    # Create logger queue listener to send records from logger_queue to handler
    logger_listener = QueueListener(logger_queue, handler)
    return logger_queue, logger_listener


def process_logging(info_queue, debug_queue, logger_name=None):
    # Create logging queue handlers
    debug_queue_handler = QueueHandler(debug_queue)
    debug_queue_handler.setLevel(logging.DEBUG)
    info_queue_handler = QueueHandler(info_queue)
    info_queue_handler.setLevel(logging.INFO)

    # Setup level of process logger
    logger = logging.getLogger()
    if logger_name:
        logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)

    # Add handlers to the logger
    logger.addHandler(debug_queue_handler)
    logger.addHandler(info_queue_handler)


def worker(info_queue, debug_queue):
    # Setup worker process logging
    process_logging(info_queue, debug_queue)
    logging.debug('Process %s initialized.', os.getpid())