I learnt that Twitter has stopped providing JSON for deleted tweets.I am trying to get past this limitation by using a polling method to see if tweet is deleted.
But my code still fails. I would appreciate it if you can help me figure out what I am missing.
import sys
import json
import tweepy
from tweepy import Stream
from tweepy.streaming import StreamListener
import datetime
import time
from polling import TimeoutException, poll
# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you after
consumer_key = 'xx'
consumer_secret = 'xx'
access_token = 'xx'
access_token_secret = 'xx'
# Set up the authorisation to use the Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# Handle the output generated from the stream
class listener(StreamListener):
tweetcount = 0
def on_data(self, data):
# Convert the message to JSON
json_data = json.loads(data)
# text_file = open(json_data['id_str'] + ".json", "w")
# text_file.write(data)
# text_file.close()
if 'id_str' not in json_data:
# If this isn't a status, do nothing.
print("no ID")
else:
#print("Twitter Id ",json_data['id_str'])
#print("User Id ",json_data['user']['id_str'])
if json_data['user']['id_str'] == '51241574': #Associated Press
tweetcount = json_data['user']['statuses_count']
tweet = api.get_status(json_data['id'])
print("Tweet Count ",tweetcount)
print("Account Name ", json_data['user']['name'])
print(tweet.text)
else:
pass
# if 'delete' in json_data:
# print ("DELETED!")
# if json_data['delete']['status']['user_id'] == '51241574':
# deleted_tweet_id =json_data['delete']['status']['id']
# tweetcount -= 1
# print("New Count is ",tweetcount)
# print(deleted_tweet_id)
# deleted_tweet =api.get_status(deleted_tweet_id)
# print(deleted_tweet.text)
#
# else:
# pass
return True
def on_error(self, status):
print("Error status is ",status)
# Start consuming from the stream. This will get all the Tweets & Deletions from the users the user is following.
twitterStream = Stream(auth, listener())
twitterStream.filter(follow=['51241574'], async=True)
# polling method to check if tweet is deleted
try:
user = api.get_user('AP')
poll(lambda: user.statuses_count >= listener.tweetcount > 0, timeout=30, step=1)
print("Tweet was deleted,New Tweet count is ", user.statuses_count)
except Exception as ex:
template = "An exception of type {0} occurred. Arguments:\n{1!r}"
message = template.format(type(ex).__name__, ex.args)
print (message)
When a listener event is fired, the application shows the value in tweet count variable and checks it against the value retrieved from querying the api.
Related
I am trying to use Tweepy and streaming to track Tweets in real time. I am using the following which works fine:
import tweepy
import configparser
import sys
#read configs
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']
class StreamCollector(tweepy.Stream):
def on_status(self, status):
if not hasattr(status, 'retweeted_status') and status.in_reply_to_screen_name == None and status.i\
s_quote_status == False:
if status.author.followers_count > 100000:
print('Twitter Handle: #'+status.author.screen_name)
print('Followers:',status.author.followers_count)
print('Tweet:',status.text)
print('\n')
#print(status.user.screen_name.encode('UTF-8'))
stream = StreamCollector(api_key,api_key_secret,access_token, access_token_secret)
stream.filter(track=["table"])
However, I want to produce the untruncated Tweet. I tried substituting status.text for status.full_text but I got the error:
AttributeError: 'Status' object has no attribute 'full_text'
My version of Tweepy is 4.5.0 and Python is 3.9.9.
The tweepy.API has a compatibility mode and extended mode. The extended mode should allow you to get the full text of the Tweet.
ref: Extended Tweets
Here is the code with the extended mode call.
import sys
import tweepy
import configparser
#read configs
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']
class StreamCollector(tweepy.Stream):
def on_status(self, status):
if not hasattr(status, 'retweeted_status'):
if status.in_reply_to_screen_name is None and status.is_quote_status is False:
if status.author.followers_count > 100000:
print(f'Twitter Handle: #{status.author.screen_name}')
print(f'Followers: {status.author.followers_count}')
if 'extended_tweet' in status._json:
full_text = status._json['extended_tweet']['full_text']
print(f'Tweet: {full_text}')
elif 'extended_tweet' not in status._json:
print(f'Tweet: {status.text}')
print('\n')
stream = StreamCollector(api_key,api_key_secret,access_token, access_token_secret)
stream.filter(track=["table"])
Streaming is covered in Tweepy's documentation on extended Tweets:
By default, the Status objects from streams may contain an extended_tweet attribute representing the equivalent field in the raw data/payload for the Tweet. This attribute/field will only exist for extended Tweets, containing a dictionary of sub-fields. The full_text sub-field/key of this dictionary will contain the full, untruncated text of the Tweet
I am using python and Tweepy in a script that loops and prints the tweet id, the time it was created and the full text. The problem am having is that it keeps on printing the same id and full text if there is no new tweet, is it possible to have an if statement or a listener that will only print the id, time created and full text only once and wait until the user creates a new tweet.
Below is my code:
import tweepy
import tkinter
import time
CONSUMER_KEY= 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
CONSUMER_SECRET= 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
ACCESS_TOKEN= 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
ACCESS_TOKEN_SECRET= 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# set parser=tweepy.parsers.JSONParser() if you want a nice printed json response.
user = api.me()
while True:
tweets = api.user_timeline(screen_name = "elonmusk",
count=1,
include_rts = False,
tweet_mode = 'extended'
)
for detailsInTweet in tweets:
print("ID: {}".format(detailsInTweet.id))
# Tweet creation time
print(detailsInTweet.created_at, "\n")
# Full tweet text
print(detailsInTweet.full_text, "\n")
print("\n")
time.sleep(5)
I am doing twitter streaming data by kafka. I managed to stream the data and consume the twitter json. But now how do i create a pyspark dataframe containing the twitter data and the search keyword?
Below is how i write the kafka producer
I managed to create the dataframe of what data i want from the twitter object. But i don't know how to get the search keyword.
class StdOutListener(StreamListener):
def __init__(self, producer):
self.producer_obj = producer
#on_status is activated whenever a tweet has been heard
def on_data(self, data):
try:
self.producer_obj.send("twitterstreamingdata", data.encode('utf-8'))
print(data)
return True
except BaseException as e:
print("Error on_data: %s" % str(e))
return True
# When an error occurs
def on_error(self, status):
print (status)
return True
# When reach the rate limit
def on_limit(self, track):
# Print rate limiting error
print("Rate limited, continuing")
# Continue mining tweets
return True
# When timed out
def on_timeout(self):
# Print timeout message
print(sys.stderr, 'Timeout...')
# Wait 10 seconds
time.sleep(120)
return True # To continue listening
def on_disconnect(self, notice):
#Called when twitter sends a disconnect notice
return
if __name__ == '__main__':
spark = SparkSession \
.builder \
.appName("Kafka Producer Application") \
.getOrCreate()
#This is the initialization of Kafka producer
producer = KafkaProducer(bootstrap_servers='xx.xxx.xxx.xxx:9092')
#This handles twitter auth and the conn to twitter streaming API
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener(producer))
print("Kafka Producer Application: ")
WORDS = input("Enter any words: ")
print ("Is this what you just said?", WORDS)
word = [u for u in WORDS.split(',')]
#This line filter twitter stream to capture data by keywords
stream.filter(track=word)
One way to resolve your problem it's changing StdOutListener class constructor to receive "keyword" parameter and add "keyword" to JSON in "on_data" function to send to Kafka
import json
import sys
import time
from kafka import KafkaProducer
from pyspark.sql import SparkSession
from tweepy import StreamListener, Stream, OAuthHandler
class StdOutListener(StreamListener):
def __init__(self, producer: KafkaProducer = None, keyword=None):
super(StreamListener, self).__init__()
self.producer = producer
self.keyword = keyword
# on_status is activated whenever a tweet has been heard
def on_data(self, data):
try:
data = json.loads(data)
data['keyword'] = self.keyword
data = json.dumps(data)
self.producer.send("twitterstreamingdata", data.encode('utf-8'))
return True
except BaseException as e:
print("Error on_data: %s" % str(e))
return True
# When an error occurs
def on_error(self, status):
print(status)
return True
# When reach the rate limit
def on_limit(self, track):
# Print rate limiting error
print("Rate limited, continuing")
# Continue mining tweets
return True
# When timed out
def on_timeout(self):
# Print timeout message
print(sys.stderr, 'Timeout...')
# Wait 10 seconds
time.sleep(120)
return True # To continue listening
def on_disconnect(self, notice):
# Called when twitter sends a disconnect notice
return
if __name__ == '__main__':
CONSUMER_KEY = 'YOUR CONSUMER KEY'
CONSUMER_SECRET = 'YOUR CONSUMER SECRET'
ACCESS_TOKEN = 'YOUR ACCESS TOKEN'
ACCESS_SECRET = 'YOUR ACCESS SECRET'
print("Kafka Producer Application: ")
words = input("Enter any words: ")
print("Is this what you just said?", words)
word = [u for u in words.split(',')]
spark = SparkSession \
.builder \
.appName("Kafka Producer Application") \
.getOrCreate()
# This is the initialization of Kafka producer
kafka_producer = KafkaProducer(bootstrap_servers='35.240.157.219:9092')
# This handles twitter auth and the conn to twitter streaming API
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
stream = Stream(auth, StdOutListener(producer=kafka_producer, keyword=word))
stream.filter(track=word)
Hope it helps you!
I can't, for the life of me, figure out what is wrong with the following four lines of code.
def getAssetExistance(asset, element, table):
dynamoTable = dynamo.Table(table)
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
return bool(response)
I am running this through aws Lambda and the log on cloudwatch is telling me the error is on the return line. This is the error (line 24 is the return line):
Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 24)
In case this helps at all, here is the rest of the code:
################################
# Slack Lambda handler.
################################
import boto3
import os
import logging
import urllib
# Grab data from the environment.
BOT_TOKEN = os.environ["BOT_TOKEN"]
ASSET_TABLE = os.environ["ASSET_TABLE"]
REGION_NAME = os.getenv('REGION_NAME', 'us-east-1')
dynamo = boto3.client('dynamodb', region_name=REGION_NAME)
# Define the URL of the targeted Slack API resource.
SLACK_URL = "https://slack.com/api/chat.postMessage"
def getAssetExistance(asset, element, table):
dynamoTable = dynamo.Table(table)
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
return bool(response)
def lambda_handler(data, context):
# Slack challenge answer.
if "challenge" in data:
return data["challenge"]
# Grab the Slack channel data.
slack_event = data['event']
slack_user = slack_event["user"]
slack_text = slack_event["text"]
channel_id = slack_event["channel"]
slack_userID = slack_user["ID"]
slack_reply = ""
# Ignore bot messages.
if "bot_id" in slack_event:
logging.warn("Ignore bot event")
else:
# Start data sift.
if slack_text.startswith("!networth"):
slack_reply = "Your networth is: "
elif slack_text.startwith("!price"):
command,asset = text.split()
slack_reply = "The price of a(n) %s is: " % (asset)
elif slack_text.startwith("!Addme"):
if not getAssetExistance(slack_userID, userID, ASSET_TABLE):
slack_reply = "Adding user: %s(%s)" % (slack_user, slack_userID)
dynamo.update_item(TableName=ASSET_TABLE,
Key={'userID':{'S':'slack_userID'},
AttributeUpdates= {
'resources':{
'Action': 'ADD',
'Value': {'N': '1000'}
}
}
)
else
slack_reply = "User %s(%s) already exists" % (slack_user, slack_userID)
# We need to send back three pieces of information:
data = urllib.parse.urlencode(
(
("token", BOT_TOKEN),
("channel", channel_id),
("text", slack_reply)
)
)
data = data.encode("ascii")
# Construct the HTTP request that will be sent to the Slack API.
request = urllib.request.Request(
SLACK_URL,
data=data,
method="POST"
)
# Add a header mentioning that the text is URL-encoded.
request.add_header(
"Content-Type",
"application/x-www-form-urlencoded"
)
# Fire off the request!
urllib.request.urlopen(request).read()
# Everything went fine.
return "200 OK"
Hopefully I am doing something dumb; I am pretty new to all this. Any help is much appreciated it. Thanks!
You skipped closed round bracket in this line:
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
replace this line by:
response = dynamoTable.query(KeyConditionExpression=Key(element)).eq(asset)
I am running a code that gets tweets from the Twitter API and saving them to a txt file. the code is as follows:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sentmod as s
#consumer key, consumer secret, access token, access secret.
ckey= "xxxxxxxxxxxxxxxxxxx"
csecret="xxxxxxxxxxxxxxxxxxx"
atoken="xxxxxxxxxxxxxxxxxxx"
asecret="xxxxxxxxxxxxxxxxxxx"
class listener(StreamListener):
def on_data(self, data):
all_data = json.loads(data)
tweet = all_data["text"]
sentiment_value, confidence = s.sentiment(tweet)
tweet.encode('ascii', 'ignore')
tweets= open("tweets.txt","a",encoding="utf-8")
tweets.write(tweet)
tweets.write('\n\n\n')
tweets.close()
print(tweet, sentiment_value, confidence)
if confidence*100 >= 60:
output = open("twitter-out.txt","a")
output.write(sentiment_value)
output.write('\n\n\n')
output.close()
return True
def on_error(self, status):
print(status)
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["Car"],languages=['en']) #locations=[]
When I add the tweets to a text file the tweets sometimes get repeated tweets, how can I fix that?