I have one producer sending events to an Event Hub, and I want to create two receivers to receive events from the Event Hub. How do I implement that?
Here is the code for the receiver:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.eventhub import EventHubSharedKeyCredential, EventData, EventHubConsumerClient
from azure.core.exceptions import ResourceExistsError
from azure.eventhub.extensions.checkpointstoreblob import BlobCheckpointStore

# Event Hub access credentials
connection_str = ****
consumer_group = '$Default'
eventhub_name = ****

# Blob storage credentials
storage_connection_str = ****
container_name = ****
storageAccount = ****

# For checkpointing in Blob storage
checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)

# Initiate BlobServiceClient to access the Blob storage
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_str)
container_client = blob_service_client.get_container_client(container_name)  # Dump final data to the Blob storage in append mode.

try:
    container_client.create_container()  # Create a new container in the service
    properties = container_client.get_container_properties()
except ResourceExistsError:
    print("Container already exists.")

# Instantiate a new BlobClient
# blob_client = container_client.get_blob_client("data.csv")

def get_messages():
    client = EventHubConsumerClient.from_connection_string(connection_str, consumer_group, eventhub_name=eventhub_name)

    def on_event_batch(partition_context, events):
        # log.info("Partition {}, Received count: {}".format(partition_context.partition_id, len(events)))
        print("Received event from partition {}".format(partition_context.partition_id))
        if len(events) == 0:
            client.close()  # Close the client if no event was received.
        else:
            for event in events:
                list_ = event.body_as_json()
            # Update checkpoint
            partition_context.update_checkpoint()

    try:
        with client:
            client.receive_batch(
                on_event_batch=on_event_batch,
                partition_id="0",  # read only from partition "0"
                # starting_position="-1",  # "-1" is from the beginning of the partition.
            )
    except KeyboardInterrupt:
        print('Stopped receiving.')

get_messages()
I have created two copies of this code, named consumer1.py and consumer2.py, but both consumers receive the same events every time.
For example, if I send 100 events, I want these two consumers to run in parallel and divide those 100 events between themselves without duplicates. How can I achieve this?
So finally I found the solution: create multiple consumers under the same consumer group, which consume the events in parallel and share the load among themselves.
import time

from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.eventhub import EventHubSharedKeyCredential, EventData, EventHubConsumerClient
from azure.core.exceptions import ResourceExistsError
from azure.eventhub.extensions.checkpointstoreblob import BlobCheckpointStore

# Event Hub access credentials
connection_str = ****
consumer_group = '$Default'
eventhub_name = ****

# Blob storage credentials
storage_connection_str = ****
container_name = ****
storageAccount = ****

# For checkpointing in Blob storage
checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)

# Initiate BlobServiceClient to access the Blob storage
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_str)
container_client = blob_service_client.get_container_client('nsc-container')

# Dump final data to the Blob storage in append mode.
try:
    container_client.create_container()  # Create a new container in the service
    properties = container_client.get_container_properties()
except ResourceExistsError:
    print("Container already exists.")

# Instantiate a new BlobClient
# blob_client = container_client.get_blob_client("data.csv")

def get_messages():
    client = EventHubConsumerClient.from_connection_string(
        connection_str,
        consumer_group,
        eventhub_name=eventhub_name,
        checkpoint_store=checkpoint_store,  # enables checkpointing and load balancing across consumers
    )

    def on_event_batch(partition_context, events):
        # log.info("Partition {}, Received count: {}".format(partition_context.partition_id, len(events)))
        print("Received event from partition {}".format(partition_context.partition_id))
        line_count = 0
        start_time = time.time()
        cnt = 0
        if len(events) == 0:
            client.close()  # Close the client if no event was received.
        else:
            for event in events:
                list_ = event.body_as_json()
                cnt += 1
            # Update checkpoint
            partition_context.update_checkpoint()
            print("Number of events received: ", cnt)
            line_count = line_count + cnt
            end_time = time.time()
            run_time = end_time - start_time
            print("\nTotal Received {} records in {} seconds.".format(line_count, run_time))

    try:
        with client:
            client.receive_batch(
                on_event_batch=on_event_batch,
            )  # No partition_id is specified, so load balancing across consumers stays enabled
    except KeyboardInterrupt:
        print('Stopped receiving.')

get_messages()
Now create as many copies of this code as you need and save them as consumer_1.py, consumer_2.py, and so on. Also, for best efficiency, keep the number of Event Hub partitions equal to the number of consumers.
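If you prefer to start all the copies from a single entry point, a small launcher can spawn them as separate OS processes. This is only a minimal sketch, under the assumption that the copies are named consumer_1.py, consumer_2.py, and so on in the current directory; it is not part of the original solution.
```
import subprocess
import sys

NUM_CONSUMERS = 2  # match this to the number of Event Hub partitions

# Start each consumer copy as its own process so they receive in parallel.
procs = [
    subprocess.Popen([sys.executable, "consumer_{}.py".format(i + 1)])
    for i in range(NUM_CONSUMERS)
]

try:
    for p in procs:
        p.wait()  # block until the consumers exit
except KeyboardInterrupt:
    for p in procs:
        p.terminate()  # Ctrl+C stops all consumers
```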
The API receives the file, then tries to create a unique blob name.
Then I upload to the blob in chunks of 4 MB. Each chunk takes about 8 seconds - is this normal? My upload speed is 110 Mbps. I tried uploading a 50 MB file and it took almost 2 minutes. I don't know whether the azure-storage-blob version is related to this; I'm using azure-storage-blob==12.14.1.
import uuid
import os
import time
from flask import request  # assumed: request comes from Flask (the commented route decorator suggests a Flask blueprint)
from azure.storage.blob import BlobClient, BlobBlock, BlobServiceClient

# @catalog_api.route("/catalog", methods=['POST'])
def catalog():
    file = request.files['file']
    url_bucket, file_name, file_type = upload_to_blob(file)

def upload_to_blob(self, file):
    file_name = file.filename
    file_type = file.content_type
    blob_client = self.generate_blob_client(file_name)
    blob_url = self.upload_chunks(blob_client, file)
    return blob_url, file_name, file_type

def generate_blob_client(self, file_name: str):
    blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
    container_client = blob_service_client.get_container_client(self.container_name)
    for _ in range(self.max_blob_name_tries):
        blob_name = self.generate_blob_name(file_name)
        blob_client = container_client.get_blob_client(blob_name)
        if not blob_client.exists():
            return blob_client
    raise Exception("Couldn't create the blob")

def upload_chunks(self, blob_client: BlobClient, file):
    block_list = []
    chunk_size = self.chunk_size
    while True:
        read_data = file.read(chunk_size)
        if not read_data:
            print("uploaded")
            break
        print("uploading")
        blk_id = str(uuid.uuid4())
        blob_client.stage_block(block_id=blk_id, data=read_data)
        block_list.append(BlobBlock(block_id=blk_id))
    blob_client.commit_block_list(block_list)
    return blob_client.url
I tried in my environment and got the results below:
I uploaded a 50 MB file to a Blob storage account with a chunk size of 4*1024*1024 from my local environment, and it took 45 seconds.
Code:
import uuid
import time
from azure.storage.blob import BlobBlock, BlobServiceClient

connection_string = "<storage account connection string>"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client('test')
blob_client = container_client.get_blob_client("file.pdf")

start = time.time()

# Upload data block by block
block_list = []
chunk_size = 4 * 1024 * 1024
with open("C:\\file.pdf", 'rb') as f:
    while True:
        read_data = f.read(chunk_size)
        if not read_data:
            break  # done
        blk_id = str(uuid.uuid4())
        blob_client.stage_block(block_id=blk_id, data=read_data)
        block_list.append(BlobBlock(block_id=blk_id))

blob_client.commit_block_list(block_list)

end = time.time()
print("Time taken to upload blob:", end - start, "secs")
In the code above, I record start and end times and print end - start at the end to measure how long the upload to Blob storage takes.
Make sure your internet speed is good; I also tried with other connection speeds and it took at most 78 seconds.
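If the chunked upload is still slow, one option worth trying (my own suggestion, not part of the original answer) is to let the SDK parallelize the transfer by calling upload_blob with max_concurrency instead of staging blocks manually. A minimal sketch, assuming the same blob_client as above:
```
# Sketch: single-call upload with parallel block transfer.
# Assumes blob_client was created as in the snippet above.
with open("C:\\file.pdf", 'rb') as f:
    blob_client.upload_blob(
        f,
        overwrite=True,      # replace the blob if it already exists
        max_concurrency=4,   # upload several blocks in parallel
    )
```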
I've got a Python script that uses the Paho library to subscribe to MQTT topics coming from a Mosquitto server and republish them. The script works fine when I invoke it from a terminal. What is tripping me up is trying to daemonize the script so it runs unattended. I'm using the daemonize library, and while I can get the script to start as a daemon, it doesn't actually do anything: when I run the non-daemonized script, I see the MQTT messages arrive at the endpoint I've specified, but when the same script runs as a daemon, no messages show up at my receiving endpoint. I'm including both the non-daemonized and daemonized scripts for reference.
Any ideas as to why the non-daemonized script works but the daemonized script does not?
Non-Daemonized
import paho.mqtt.client as mqtt
import random
import json
import time

# replace [device-id] with your device you created in IoT Hub.
iothubmqtttopic = "devices/MoxieSensorsBHM/messages/events/"

# define on_connect function
def on_connect(client, userdata, flags, rc):
    print("Connected with result code " + str(rc))
    client.subscribe("MW/oneal/Tag/#")

# define on_message function - this function translates the incoming topic
# and publishes to the specified topic for IoT Hub
def on_message(client, userdata, message):
    global iothubmqtttopic
    topic = message.topic
    if topic != iothubmqtttopic:
        # extract the sensor ID from the topic
        splitTopic = topic.split("/")  # split the topic into a list
        sensorID = splitTopic[3]       # splitTopic[3] is the sensor ID
        msgType = splitTopic[4]        # splitTopic[4] is the type (e.g. status, UWB)

        # convert the json response to a python dictionary object
        m_decode = str(message.payload.decode("utf-8"))
        m_in = json.loads(m_decode)

        # get current time for timestamp
        ts = time.gmtime()
        tsFormatted = time.strftime("%Y-%m-%d %H:%M:%S", ts)

        # add new elements to the dictionary for topic and sensor ID
        m_in["topic"] = topic
        m_in["sensorID"] = sensorID
        m_in["messageType"] = msgType
        m_in["timestamp"] = tsFormatted

        # set the property bag for IoT Hub
        propertyBag = "topic=" + topic + "&sensorID=" + sensorID + "&messageType=" + msgType
        print(propertyBag)

        # convert back to JSON
        m_encode = json.dumps(m_in)

        # print to screen and publish to IoT Hub
        print("Topic: ", topic)
        print("Message received: ", m_encode)
        print("IoT Hub Topic: ", iothubmqtttopic)
        client.publish(iothubmqtttopic, m_encode)

# replace <broker address> with the FQDN or IP address of your MQTT broker
broker_address = "localhost"

# create the client and connect to the broker
print("creating new instance")
client = mqtt.Client("iottopicxlate" + str(random.randrange(1, 1000000)))  # create new instance
client.on_message = on_message  # attach function to callback
print("connecting to broker")
client.connect(broker_address)  # connect to broker
client.on_connect = on_connect  # attach function to callback
client.loop_forever()           # start the network loop
Daemonized
import paho.mqtt.client as mqtt
import random
import json
import time
import os, sys
from daemonize import Daemonize

def main():
    # replace [device-id] with your device you created in IoT Hub.
    iothubmqtttopic = "devices/MoxieSensorsBHM/messages/events/"

    # define on_connect function
    def on_connect(client, userdata, flags, rc):
        # print("Connected with result code " + str(rc))
        client.subscribe("MW/oneal/Tag/#")

    # define on_message function - this function translates the incoming topic
    # and publishes to the specified topic for IoT Hub
    def on_message(client, userdata, message):
        global iothubmqtttopic
        topic = message.topic
        if topic != iothubmqtttopic:
            # extract the sensor ID from the topic
            splitTopic = topic.split("/")  # split the topic into a list
            sensorID = splitTopic[3]       # splitTopic[3] is the sensor ID
            msgType = splitTopic[4]        # splitTopic[4] is the type (e.g. status, UWB)

            # convert the json response to a python dictionary object
            m_decode = str(message.payload.decode("utf-8"))
            m_in = json.loads(m_decode)

            # get current time for timestamp
            ts = time.gmtime()
            tsFormatted = time.strftime("%Y-%m-%d %H:%M:%S", ts)

            # add new elements to the dictionary for topic and sensor ID
            m_in["topic"] = topic
            m_in["sensorID"] = sensorID
            m_in["messageType"] = msgType
            m_in["timestamp"] = tsFormatted

            # set the property bag for IoT Hub
            propertyBag = "topic=" + topic + "&sensorID=" + sensorID + "&messageType=" + msgType
            # print(propertyBag)

            # convert back to JSON
            m_encode = json.dumps(m_in)

            # print to screen and publish to IoT Hub
            print("Topic: ", topic + propertyBag)
            print("Message received: ", m_encode)
            client.publish(iothubmqtttopic, m_encode)

    # replace <broker address> with the FQDN or IP address of your MQTT broker
    broker_address = "localhost"

    # create the client and connect to the broker
    # print("creating new instance")
    client = mqtt.Client("iottopicxlate" + str(random.randrange(1, 1000000)))  # create new instance
    client.on_message = on_message  # attach function to callback
    # print("connecting to broker")
    client.connect(broker_address)  # connect to broker
    client.on_connect = on_connect  # attach function to callback
    client.loop_forever()           # start the network loop

# start the daemon
topictransdpid = os.path.basename(sys.argv[0])
pidfile = "topictransd.pid"
daemon = Daemonize(app="topictransd", pid=pidfile, action=main)
daemon.start()
Update
Vincent, I'm trying to implement your suggestion to write to a file for debugging. Please bear with me, as I'm learning Python on the fly. I added the following snippet to the on_message function in the non-daemonized (i.e. working) version of the script, and I see the messages written to text files in my "debug" directory. When I add the same snippet to the daemonized version, no files are written. So while my daemon is running, it isn't actually doing anything.
f = open("debug/" + sensorID + tsFormatted + ".txt", "x")
f.write(m_encode)
f.close()
Any ideas as to what I'm missing?
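One thing worth ruling out (this is my assumption, not something confirmed in the thread): daemonize normally changes the working directory and closes open file descriptors when it forks, so a relative path such as debug/... may no longer resolve once the script runs as a daemon. A variant of the same snippet that writes to an absolute path instead:
```
import os

DEBUG_DIR = "/tmp/topictransd-debug"  # absolute path, assumed writable by the daemon's user
os.makedirs(DEBUG_DIR, exist_ok=True)

# sensorID, tsFormatted and m_encode come from the surrounding on_message handler,
# exactly as in the snippet above.
f = open(os.path.join(DEBUG_DIR, sensorID + tsFormatted + ".txt"), "w")
f.write(m_encode)
f.close()
```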
Update 2
I've implemented a simple logger that writes one debug message when the script starts and another when daemonize calls the main() function. My log file has an entry for the script starting, but no entry for main() being called - it's as though daemonize isn't executing the main() function at all. Here is my updated daemon script with the logger enabled:
import paho.mqtt.client as mqtt
import random
import json
import time
import logging
import os, sys
from daemonize import Daemonize

def main():
    logger.warning("main has been started")  # write a log file entry to indicate main has been called by daemonize

    # replace [device-id] with your device you created in IoT Hub.
    iothubmqtttopic = "devices/MoxieSensorsBHM/messages/events/"

    # define on_connect function
    def on_connect(client, userdata, flags, rc):
        # print("Connected with result code " + str(rc))
        client.subscribe("MW/oneal/Tag/#")

    # define on_message function - this function translates the incoming topic
    # and publishes to the specified topic for IoT Hub
    def on_message(client, userdata, message):
        global iothubmqtttopic
        topic = message.topic
        if topic != iothubmqtttopic:
            # extract the sensor ID from the topic
            splitTopic = topic.split("/")  # split the topic into a list
            sensorID = splitTopic[3]       # splitTopic[3] is the sensor ID
            msgType = splitTopic[4]        # splitTopic[4] is the type (e.g. status, UWB)

            # convert the json response to a python dictionary object
            m_decode = str(message.payload.decode("utf-8"))
            m_in = json.loads(m_decode)

            # get current time for timestamp
            ts = time.gmtime()
            tsFormatted = time.strftime("%Y-%m-%d %H:%M:%S", ts)

            # add new elements to the dictionary for topic and sensor ID
            m_in["topic"] = topic
            m_in["sensorID"] = sensorID
            m_in["messageType"] = msgType
            m_in["timestamp"] = tsFormatted

            # set the property bag for IoT Hub
            propertyBag = "topic=" + topic + "&sensorID=" + sensorID + "&messageType=" + msgType
            # print(propertyBag)

            # convert back to JSON
            m_encode = json.dumps(m_in)

            # print to screen and publish to IoT Hub
            # print("Topic: ", topic + propertyBag)
            # print("Message received: ", m_encode)
            client.publish(iothubmqtttopic, m_encode)

            # write the message to a debug file
            f = open("debug/" + sensorID + tsFormatted + ".txt", "w")
            f.write(m_encode)
            f.close()

    # replace <broker address> with the FQDN or IP address of your MQTT broker
    broker_address = "localhost"

    # create the client and connect to the broker
    # print("creating new instance")
    client = mqtt.Client("iottopicxlate" + str(random.randrange(1, 1000000)))  # create new instance

    # create a logger
    # logging.basicConfig(level=logging.DEBUG, filename="topictransd.log", format="%(asctime)s %(message)s", filemode="w")
    # logger = logging.getLogger()
    # client.enable_logger(logger)

    client.on_message = on_message  # attach function to callback
    # print("connecting to broker")
    client.connect(broker_address)  # connect to broker
    client.on_connect = on_connect  # attach function to callback

    # start the network loop
    client.loop_forever()

# start the daemon
logging.basicConfig(level=logging.DEBUG, filename="topictransd.log", format="%(asctime)s %(message)s", filemode="w")
logger = logging.getLogger()
pidfile = "topictransd.pid"
logger.warning("successfully started the script")  # write a log file entry to indicate the script has successfully started
daemon = Daemonize(app="topictransd", pid=pidfile, action=main)
daemon.start()
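For reference, the daemonize README shows keeping the log file's descriptor open across daemonization by passing it through keep_fds (by default daemonize closes all open file descriptors, and it also changes the working directory, which is why relative paths and already-configured handlers can stop working). A minimal sketch of that pattern, adapted to the names used above and offered as something to test rather than a confirmed fix:
```
import logging
from daemonize import Daemonize

pidfile = "/tmp/topictransd.pid"

# Log to an absolute path and keep its file descriptor open across daemonization.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False
fh = logging.FileHandler("/tmp/topictransd.log", "w")
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
keep_fds = [fh.stream.fileno()]

def main():
    logger.warning("main has been started")
    # ... set up the MQTT client exactly as in the script above ...

daemon = Daemonize(app="topictransd", pid=pidfile, action=main, keep_fds=keep_fds)
daemon.start()
```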
I have one topic and one subscription with multiple subscribers. In my application scenario, I want to process a specific number of messages at a time across the different subscribers. For example, suppose 8 messages are being processed; as soon as one message finishes and is acknowledged, the next message should be taken from the topic, while making sure no duplicate message shows up on any subscriber and 8 messages are always being processed in the background.
For this I used the synchronous pull method with max_messages = 8, but the next pull only happens after all of those messages have finished processing. So we created our own scheduler in which 8 processes run in the background at the same time, each pulling 1 message at a time, but the next message is still only delivered after all 8 messages have finished processing.
Here is my code:
#!/usr/bin/env python3
import logging
import multiprocessing
import time
import sys
import random
from google.cloud import pubsub_v1

project_id = 'xyz'
subscription_name = 'abc'
NUM_MESSAGES = 4
ACK_DEADLINE = 50
SLEEP_TIME = 20

multiprocessing.log_to_stderr()
logger = multiprocessing.get_logger()
logger.setLevel(logging.INFO)

def worker(msg):
    logger.info("Received message:{}".format(msg.message.data))
    random_sleep = random.randint(200, 800)
    logger.info("Received message:{} for {} sec".format(msg.message.data, random_sleep))
    time.sleep(random_sleep)

def message_puller():
    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(project_id, subscription_name)
    while True:
        try:
            response = subscriber.pull(subscription_path, max_messages=1)
            message = response.received_messages[0]
            msg = message
            ack_id = message.ack_id
            process = multiprocessing.Process(target=worker, args=(message,))
            process.start()
            while process.is_alive():
                # `ack_deadline_seconds` must be between 10 and 600.
                subscriber.modify_ack_deadline(subscription_path, [ack_id], ack_deadline_seconds=ACK_DEADLINE)
                time.sleep(SLEEP_TIME)
            # Final ack.
            subscriber.acknowledge(subscription_path, [ack_id])
            logger.info("Acknowledging message: {}".format(msg.message.data))
        except Exception as e:
            print(e)
            continue

def synchronous_pull():
    p = []
    for i in range(0, NUM_MESSAGES):
        p.append(multiprocessing.Process(target=message_puller))
    for i in range(0, NUM_MESSAGES):
        p[i].start()
    for i in range(0, NUM_MESSAGES):
        p[i].join()

if __name__ == '__main__':
    synchronous_pull()
Also, sometimes subscriber.pull does not pull any messages even though the while loop is always True. It gives me this error:
list index (0) out of range
So subscriber.pull is not pulling messages even though there are messages on the topic, but after some time it starts pulling again. Why is that?
I have also tried asynchronous pulling with flow control, but duplicate messages were found on multiple subscribers. If any other method will resolve my issue, please let me know. Thanks in advance.
Google Cloud Pub/Sub guarantees at-least-once delivery (docs), which means messages may be delivered more than once. To tackle this, you need to make your program/system idempotent.
You have multiple subscribers pulling 8 messages each.
To avoid the same message being processed by multiple subscribers, acknowledge the message as soon as a subscriber pulls it and moves on to processing, rather than acknowledging it at the end, after the message has been fully processed.
Also, instead of running your main script continuously, sleep for some constant time when there are no messages in the queue.
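As an illustration only (this is not the exact code I used, and acknowledging before processing trades at-least-once safety for fewer duplicates, so a message that fails mid-processing is lost), here is a minimal sketch of that pull, acknowledge immediately, then process pattern, using the same older synchronous-pull call style as the code in the question:
```
import time
from google.cloud import pubsub_v1
from google.api_core.exceptions import DeadlineExceeded

project_id = 'xyz'         # same placeholders as the question
subscription_name = 'abc'

subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(project_id, subscription_name)

def process(message):
    print("processing", message.message.data)

while True:
    try:
        response = subscriber.pull(subscription_path, max_messages=8)
        received = response.received_messages
    except DeadlineExceeded:
        received = []          # nothing was available before the pull deadline
    if not received:
        time.sleep(10)         # queue is empty, back off for a constant time
        continue
    # Acknowledge first so the same messages are not redelivered to another subscriber...
    subscriber.acknowledge(subscription_path, [m.ack_id for m in received])
    # ...then do the actual processing.
    for message in received:
        process(message)
```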
I had similar code where I used synchronous pull, except that I did not use parallel processing.
Here's the code:
PubSubHandler - Class to handle Pubsub related operations
from google.cloud import pubsub_v1
from google.api_core.exceptions import DeadlineExceeded

class PubSubHandler:
    def __init__(self, subscriber_config):
        self.project_name = subscriber_config['PROJECT_NAME']
        self.subscriber_name = subscriber_config['SUBSCRIBER_NAME']
        self.subscriber = pubsub_v1.SubscriberClient()
        self.subscriber_path = self.subscriber.subscription_path(self.project_name, self.subscriber_name)

    def pull_messages(self, number_of_messages):
        try:
            response = self.subscriber.pull(self.subscriber_path, max_messages=number_of_messages)
            received_messages = response.received_messages
        except DeadlineExceeded as e:
            received_messages = []
            print('No messages caused error')
        return received_messages

    def ack_messages(self, message_ids):
        if len(message_ids) > 0:
            self.subscriber.acknowledge(self.subscriber_path, message_ids)
        return True
Utils - Class for util methods
import json

class Utils:
    def __init__(self):
        pass

    def decoded_data_to_json(self, decoded_data):
        try:
            decoded_data = decoded_data.replace("'", '"')
            json_data = json.loads(decoded_data)
            return json_data
        except Exception as e:
            raise Exception('error while parsing json')

    def raw_data_to_utf(self, raw_data):
        try:
            decoded_data = raw_data.decode('utf8')
            return decoded_data
        except Exception as e:
            raise Exception('error converting to UTF')
Orchestrator - Main script
import time
import json
import logging
from utils import Utils
from db_connection import DbHandler
from pub_sub_handler import PubSubHandler

# Placeholder config for PubSubHandler; fill in your own project and subscription.
subscriber_config = {
    'PROJECT_NAME': '<your project id>',
    'SUBSCRIBER_NAME': '<your subscription name>',
}

class Orchestrator:
    def __init__(self):
        self.MAX_NUM_MESSAGES = 2
        self.SLEEP_TIME = 10
        self.util_methods = Utils()
        self.pub_sub_handler = PubSubHandler(subscriber_config)

    def main_handler(self):
        to_ack_ids = []
        pulled_messages = self.pub_sub_handler.pull_messages(self.MAX_NUM_MESSAGES)
        if len(pulled_messages) < 1:
            self.SLEEP_TIME = 1
            print('no messages in queue')
            return
        logging.info('messages in queue')
        self.SLEEP_TIME = 10
        for message in pulled_messages:
            raw_data = message.message.data
            try:
                decoded_data = self.util_methods.raw_data_to_utf(raw_data)
                json_data = self.util_methods.decoded_data_to_json(decoded_data)
                print(json_data)
            except Exception as e:
                logging.error(e)
            to_ack_ids.append(message.ack_id)
        if self.pub_sub_handler.ack_messages(to_ack_ids):
            print('acknowledged msg_ids')

if __name__ == "__main__":
    orchestrator = Orchestrator()
    print('Receiving data..')
    while True:
        orchestrator.main_handler()
        time.sleep(orchestrator.SLEEP_TIME)
I've tried so many different GitHub project examples and read the Bigtable API guide multiple times, and I can't figure out why it won't let me set multiple cells in a row. The examples only show one value per row.
I also used the cbt commands to check that the column families I added are in the table; they are, but when I use the count command I see no entries.
I've used both the mutate_rows command on the table and the commit command on the row, but neither adds the row.
I also realize that the row commit command is literally just:
table.mutate_rows([row])
So I can't figure out what I'm doing wrong at all.
import base64
import json
import ast
import datetime
from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.cloud.bigtable import row_filters

def function(event, context):
    data = base64.b64decode(event['data']).decode('utf-8')
    data = ast.literal_eval(data)
    print(type(data))
    print(data)

    # Create a Cloud Bigtable client.
    client = bigtable.Client(project=project_id, admin=True)

    # Connect to an existing Cloud Bigtable instance.
    instance = client.instance(instance_id)

    print('opening the {} table.'.format(table_id))
    table = instance.table(table_id)

    # [START writing_rows]
    max_versions_rule = column_family.MaxVersionsGCRule(2)
    column_family_id = 'states'.encode('utf-8')
    column_families = {column_family_id: max_versions_rule}
    if not table.exists():
        table.create(column_families=column_families)
    else:
        print("Table {} already exists.".format(table_id))

    row_key = (data['serial_num'] + str(datetime.datetime.utcnow())).encode('utf-8')
    row_obj = table.row(row_key)
    for key, value in data.items():
        row_obj.set_cell(
            column_family_id,
            str(key).encode('utf-8'),
            str(value).encode('utf-8'),
            timestamp=datetime.datetime.utcnow()
        )
    print(row_obj)
    print(str(row_obj))
    print(row_obj.table)
    print(row_obj.row_key)
    row_obj.commit()
    '''
    table.mutate_rows([row_obj])
    '''
    print('Inserted/updated data.')
    # [END writing_rows]

    # [START creating_a_filter]
    # Create a filter to only retrieve the most recent version of the cell
    # for each column across entire row.
    row_filter = row_filters.CellsColumnLimitFilter(1)
    # [END creating_a_filter]

    # [START read_rows]
    row = table.read_row(row_key, row_filter)
    print(row)
    for key, value in data.items():
        cell_values = row.cells[column_family_id][str(key).encode('utf-8')][0]
        print('{} = {} should be {}'.format(key, cell_values, value))
    # [END read_rows]
This is the solution I ended up with:
import base64
import json
import ast
import datetime
from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.cloud.bigtable import row_filters

def hello_pubsub(event, context):
    data = base64.b64decode(event['data']).decode('utf-8')
    data = ast.literal_eval(data)
    print(type(data))
    print(data)

    # Create a Cloud Bigtable client.
    client = bigtable.Client(project=project_id, admin=True)

    # Connect to an existing Cloud Bigtable instance.
    instance = client.instance(instance_id)

    print('opening the {} table.'.format(table_id))
    table = instance.table(table_id)

    # [START writing_rows]
    max_versions_rule = column_family.MaxVersionsGCRule(2)
    column_family_id = 'state'
    column_families = {column_family_id: max_versions_rule}
    if not table.exists():
        table.create(column_families=column_families)
    else:
        print("Table {} already exists.".format(table_id))

    row_key = (data['serial_num'] + " " + str(datetime.datetime.utcnow())).encode('utf-8')
    rows = []
    for key, value in data.items():
        row = table.row(row_key)
        row.set_cell(column_family_id,
                     str(key).encode('utf-8'),
                     str(value),
                     timestamp=datetime.datetime.utcnow())
        rows.append(row)
    table.mutate_rows(rows)
    print('Inserted/updated data.')
    # [END writing_rows]

    # [START creating_a_filter]
    # Create a filter to only retrieve the most recent version of the cell
    # for each column across entire row.
    row_filter = row_filters.CellsColumnLimitFilter(1)
    # [END creating_a_filter]

    # [START read_rows]
    partial_rows = table.read_row(row_key, row_filter)
    print(partial_rows.cells)
    for key, value in data.items():
        cell_value = partial_rows.cell_value(column_family_id, str(key).encode('utf-8'))
        print('{} = {} should be {}'.format(key, cell_value, value))
    # [END read_rows]
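One extra check that can help when writes appear to go missing (a suggestion of mine, not part of the original solution): table.mutate_rows returns one status per row, so inspecting the status codes shows whether each mutation actually landed. A minimal sketch, reusing the rows list built above:
```
# mutate_rows returns a list of google.rpc Status objects, one per row.
statuses = table.mutate_rows(rows)
for i, status in enumerate(statuses):
    if status.code != 0:  # 0 means OK
        print("Row {} failed: {} {}".format(i, status.code, status.message))
    else:
        print("Row {} written.".format(i))
```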
I learned that Twitter has stopped providing JSON for deleted tweets. I am trying to get around this limitation by using a polling method to check whether a tweet has been deleted.
But my code still fails. I would appreciate it if you could help me figure out what I am missing.
import sys
import json
import tweepy
from tweepy import Stream
from tweepy.streaming import StreamListener
import datetime
import time
from polling import TimeoutException, poll

# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you after.
consumer_key = 'xx'
consumer_secret = 'xx'
access_token = 'xx'
access_token_secret = 'xx'

# Set up the authorisation to use the Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Handle the output generated from the stream
class listener(StreamListener):
    tweetcount = 0

    def on_data(self, data):
        # Convert the message to JSON
        json_data = json.loads(data)
        # text_file = open(json_data['id_str'] + ".json", "w")
        # text_file.write(data)
        # text_file.close()
        if 'id_str' not in json_data:
            # If this isn't a status, do nothing.
            print("no ID")
        else:
            # print("Twitter Id ", json_data['id_str'])
            # print("User Id ", json_data['user']['id_str'])
            if json_data['user']['id_str'] == '51241574':  # Associated Press
                tweetcount = json_data['user']['statuses_count']
                tweet = api.get_status(json_data['id'])
                print("Tweet Count ", tweetcount)
                print("Account Name ", json_data['user']['name'])
                print(tweet.text)
            else:
                pass
        # if 'delete' in json_data:
        #     print("DELETED!")
        #     if json_data['delete']['status']['user_id'] == '51241574':
        #         deleted_tweet_id = json_data['delete']['status']['id']
        #         tweetcount -= 1
        #         print("New Count is ", tweetcount)
        #         print(deleted_tweet_id)
        #         deleted_tweet = api.get_status(deleted_tweet_id)
        #         print(deleted_tweet.text)
        #     else:
        #         pass
        return True

    def on_error(self, status):
        print("Error status is ", status)

# Start consuming from the stream. This will get all the Tweets & Deletions from the users the user is following.
twitterStream = Stream(auth, listener())
twitterStream.filter(follow=['51241574'], is_async=True)  # 'async' was renamed to 'is_async' in newer Tweepy 3.x releases

# Polling method to check if a tweet was deleted
try:
    user = api.get_user('AP')
    poll(lambda: user.statuses_count >= listener.tweetcount > 0, timeout=30, step=1)
    print("Tweet was deleted, new tweet count is ", user.statuses_count)
except Exception as ex:
    template = "An exception of type {0} occurred. Arguments:\n{1!r}"
    message = template.format(type(ex).__name__, ex.args)
    print(message)
When a listener event is fired, the application prints the value of the tweet count variable and checks it against the value retrieved by querying the API.