from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import tweepy
import textblob
import re
from textblob import TextBlob
import pandas as pd
import numpy as np
ACCESS_TOKEN="XXXX"
ACCESS_SECRET="XXXX"
CONSUMER_KEY="XXXX"
CONSUMER_SECRET="XXXX"
def twitter_setup():
"""
Utility function to setup the Twitter's API
with our access keys provided.
"""
# Authentication and access using keys:
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# Return API with authentication:
api = tweepy.API(auth)
return api
extractor = twitter_setup()
tweets = extractor.user_timeline(screen_name="realDonaldTrump", count=200)
data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
data['len'] = np.array([len(tweet.text) for tweet in tweets])
data['ID'] = np.array([tweet.id for tweet in tweets])
data['Date'] = np.array([tweet.created_at for tweet in tweets])
data['Source'] = np.array([tweet.source for tweet in tweets])
data['Likes'] = np.array([tweet.favorite_count for tweet in tweets])
data['RTs'] = np.array([tweet.retweet_count for tweet in tweets])
def clean_tweet(tweet):
'''
Utility function to clean the text in a tweet by removing
links and special characters using regex.
'''
return ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
def analize_sentiment(tweet):
'''
Utility function to classify the polarity of a tweet
using textblob.
'''
analysis = TextBlob(clean_tweet(tweet))
#print(analysis.sentiment.polarity)
if analysis.sentiment.polarity > 0:
return 1
elif analysis.sentiment.polarity == 0:
return 0
else:
return -1
data['SA'] = np.array([ analize_sentiment(tweet) for tweet in data['Tweets'] ])
display(data.head(200))
I am working on a Project, in this project we are extracting tweets of some of the world leaders and then we will try to compare their relationships with other countries based on their twitter comment. So far we have extracted the tweets from Donald Trump Account We have categorized the tweets into positive and negative but what problem I am facing is how we can separate the tweets country-wise, Is their any way by which only those tweets are extracted in which he/she has tweeted about some country and the rest of the tweets are ignored so that we can only get the tweets related to the country.
I don't have enough reputation to add a comment, but you need to know that you have posted all your access tokens and that is a bad idea.
You might load-up a list of countries such as: github repo by marijn. It also contains a list with nationalities github repo by marijn
Check per tweet whether a name in the list occurs (so you would have to iterate over the list). You might add a counter for each country occuring per tweet. Add this counter-data as a column to your dataframe (similar to your earlier approach to analyze the sentiment).
This is just an idea, I'm not able to comment yet due to the fact I'm new.
Related
I am trying to download tweets based on keywords, and this is the piece of code I am using for that, but the problem is I am unable to download the entire tweet. It is showing only the first 140 characters. How to download the full tweet?
import tweepy
import csv
import pandas as pd
####input your credentials here
consumer_key =''
consumer_secret =''
access_token =''
access_token_secret=''
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)
#####United Airlines
# Open/Create a file to append data
csvFile = open(r'C:\Users\iiit\Desktop\tweets.csv', 'a')
#Use csv Writer
csvWriter = csv.writer(csvFile)
for tweet in tweepy.Cursor(api.search,q="#hashtag",count=1000,
lang="en",
since="2020-04-14").items():
print (tweet.created_at, tweet.text)
csvWriter.writerow([tweet.created_at, tweet.text.encode('utf-8')])
According to official tweepy documentation:
tweepy.API methods that return status objects (tweets) accept a parameter, which can either be compatibility or extended mode. The default parameter is compatibility mode. Compatibility mode truncates tweets to 140 characters. That's what your program is currently doing.
You should use extended mode if you want the entire text. Pass the kwag tweet_mode="extended" to declare the mode you intend to use. Then instead of tweet.text use tweet.full_text
Your new code should look like this;
for tweet in tweepy.Cursor(api.search,
q="#hashtag", count=5, lang="en",
since="2020-05-07", tweet_mode="extended").items():
print (tweet.created_at, tweet.full_text)
csvWriter.writerow([tweet.created_at, tweet.full_text.encode('utf-8')])
I haven't tested, but it should work.
As I know there is a policy that Compatibility mode truncates tweets to 140 characters. But this Status event handler possible to use for a StreamListener prints the full text of the Tweet, or if it’s a Retweet, the full text of the Retweeted Tweet as their documentation.
def on_status(self, status):
if hasattr(status, "retweeted_status"): # Check if Retweet
try:
print(status.retweeted_status.extended_tweet["full_text"])
except AttributeError:
print(status.retweeted_status.text)
else:
try:
print(status.extended_tweet["full_text"])
except AttributeError:
print(status.text)
This may help you.here
As I know there is a policy that Compatibility mode truncates tweets to 140 characters. Tthe full text of the Retweeted Tweet as their documentation.
def on_status(self, status):
if hasattr(status, "retweeted_status"): # Check if Retweet
try:
print(status.retweeted_status.extended_tweet["full_text"])
except AttributeError:
print(status.retweeted_status.text)
else:
try:
print(status.extended_tweet["full_text"])
except AttributeError:
print(status.text)
This may help you.here
CONTEXT: I have a list of tweet ids and their textual content and I need to crawl their metadata. However, my code crawls the tweet metadata and text as well. Since I have about 100K tweet ids I do not wish to waste time crawling the tweet text again.
Question: How can I adapt the following code so I would be able to download only tweet metadata. I'm using tweepy and python 3.6.
def get_tweets_single(twapi, idfilepath):
#tweet_id = '522778758168580098'
tw_list = []
with open(idfilepath,'r') as f1:#A File that Contains tweet IDS
lines = f1.readlines()
for line in lines:
try:
print(line.rstrip('\n'))
tweet = twapi.get_status(line.rstrip('\n'))#tweepy function to crawl tweet metadata
tw_list.append(tweet)
#tweet = twapi.statuses_lookup(id_=tweet_id,include_entities=True, trim_user=True)
with open(idjsonFile,'a',encoding='utf-8')as f2:
json.dump(tweet._json,f2)
except tweepy.TweepError as te:
print('Failed to get tweet ID %s: %s', tweet_id, te.message)
def main(args):
print('hello')
# connect to twitter
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth)
get_tweets_single(api, idfilepath)
You cannot only download metadata about the tweet.
Looking at the documentation you can choose to exclude information about the user with trim_user=true - but that's the only thing you can strip out.
actually, I need to do a project on machine learning. In that I want a lot of images for training. I searched for this problem, but I failed to do so.
can anyone help me to solve this. Thanks in advance.
I used google images to download images using selenium. It is just a basic approach.
from selenium import webdriver
import time
import urllib.request
import os
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome("path\\to\\the\\webdriverFile")
browser.get("https://www.google.com")
search = browser.find_element_by_name(‘q’)
search.send_keys(key_words,Keys.ENTER) # use required key_words to download images
elem = browser.find_element_by_link_text(‘Images’)
elem.get_attribute(‘href’)
elem.click()
value = 0
for i in range(20):
browser.execute_script(“scrollBy(“+ str(value) +”,+1000);”)
value += 1000
time.sleep(3)
elem1 = browser.find_element_by_id(‘islmp’)
sub = elem1.find_elements_by_tag_name(“img”)
try:
os.mkdir(‘downloads’)
except FileExistsError:
pass
count = 0
for i in sub:
src = i.get_attribute('src')
try:
if src != None:
src = str(src)
print(src)
count+=1
urllib.request.urlretrieve(src,
os.path.join('downloads','image'+str(count)+'.jpg'))
else:
raise TypeError
except TypeError:
print('fail')
if count == required_images_number: ## use number as required
break
check this for detailed explanation.
download driver here
My tip to you is: Use pictures API. This is my favourite: Bing Image Search API
Following text from Send search queries using the REST API and Python.
Running the quickstart
To get started, set subscription_key to a valid subscription key for the Bing API service.
Python
subscription_key = None
assert subscription_key
Next, verify that the search_url endpoint is correct. At this writing, only one endpoint is used for Bing search APIs. If you encounter authorization errors, double-check this value against the Bing search endpoint in your Azure dashboard.
Python
search_url = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
Set search_term to look for images of puppies.
Python
search_term = "puppies"
The following block uses the requests library in Python to call out to the Bing search APIs and return the results as a JSON object. Observe that we pass in the API key via the headers dictionary and the search term via the params dictionary. To see the full list of options that can be used to filter search results, refer to the REST API documentation.
Python
import requests
headers = {"Ocp-Apim-Subscription-Key" : subscription_key}
params = {"q": search_term, "license": "public", "imageType": "photo"}
response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()
search_results = response.json()
The search_results object contains the actual images along with rich metadata such as related items. For example, the following line of code can extract the thumbnail URLS for the first 16 results.
Python
thumbnail_urls = [img["thumbnailUrl"] for img in search_results["value"][:16]]
Then use the PIL library to download the thumbnail images and the matplotlib library to render them on a $4 \times 4$ grid.
Python
%matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
f, axes = plt.subplots(4, 4)
for i in range(4):
for j in range(4):
image_data = requests.get(thumbnail_urls[i+4*j])
image_data.raise_for_status()
image = Image.open(BytesIO(image_data.content))
axes[i][j].imshow(image)
axes[i][j].axis("off")
plt.show()
Sample JSON response
Responses from the Bing Image Search API are returned as JSON. This sample response has been truncated to show a single result.
JSON
{
"_type":"Images",
"instrumentation":{
"_type":"ResponseInstrumentation"
},
"readLink":"images\/search?q=tropical ocean",
"webSearchUrl":"https:\/\/www.bing.com\/images\/search?q=tropical ocean&FORM=OIIARP",
"totalEstimatedMatches":842,
"nextOffset":47,
"value":[
{
"webSearchUrl":"https:\/\/www.bing.com\/images\/search?view=detailv2&FORM=OIIRPO&q=tropical+ocean&id=8607ACDACB243BDEA7E1EF78127DA931E680E3A5&simid=608027248313960152",
"name":"My Life in the Ocean | The greatest WordPress.com site in ...",
"thumbnailUrl":"https:\/\/tse3.mm.bing.net\/th?id=OIP.fmwSKKmKpmZtJiBDps1kLAHaEo&pid=Api",
"datePublished":"2017-11-03T08:51:00.0000000Z",
"contentUrl":"https:\/\/mylifeintheocean.files.wordpress.com\/2012\/11\/tropical-ocean-wallpaper-1920x12003.jpg",
"hostPageUrl":"https:\/\/mylifeintheocean.wordpress.com\/",
"contentSize":"897388 B",
"encodingFormat":"jpeg",
"hostPageDisplayUrl":"https:\/\/mylifeintheocean.wordpress.com",
"width":1920,
"height":1200,
"thumbnail":{
"width":474,
"height":296
},
"imageInsightsToken":"ccid_fmwSKKmK*mid_8607ACDACB243BDEA7E1EF78127DA931E680E3A5*simid_608027248313960152*thid_OIP.fmwSKKmKpmZtJiBDps1kLAHaEo",
"insightsMetadata":{
"recipeSourcesCount":0,
"bestRepresentativeQuery":{
"text":"Tropical Beaches Desktop Wallpaper",
"displayText":"Tropical Beaches Desktop Wallpaper",
"webSearchUrl":"https:\/\/www.bing.com\/images\/search?q=Tropical+Beaches+Desktop+Wallpaper&id=8607ACDACB243BDEA7E1EF78127DA931E680E3A5&FORM=IDBQDM"
},
"pagesIncludingCount":115,
"availableSizesCount":44
},
"imageId":"8607ACDACB243BDEA7E1EF78127DA931E680E3A5",
"accentColor":"0050B2"
}
}
I want to access past three months of twitter data for training a stock prediction model. I want to run a loop to get data on successive dates and I am using Tweepy in python but I don't know how to get the data on a specific date using tweepy. Also I am a bit confused about data I am currently getting. Is it the data on same day or some previous days? This is my code. How should I modify this code to get Twitter data on specific dates? I am new to coding with python. So it would help a lot if you tell me the code.
import sys
import tweepy
import matplotlib.pyplot as plt
def percentage(part, whole):
return 100*float(part)/float(whole)
consumerKey = "aaaaaaaaaaaaaaaaaaaaaaaaa"
consumerSecret = "bbbbbbbbbbbbbbbbbbbbbbbb"
accessToken = "ccccccccccccccc"
accessTokenSecret = "ddddddddddddddddddddddddddddd"
auth = tweepy.OAuthHandler(consumerKey,consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)
searchTerm = input("Enter keyword/hashtag to search about: ")
noOfSearchTerms = int(input("Enter how many tweets to analyze: "))
tweets = tweepy.Cursor(api.search, q = searchTerm).items(noOfSearchTerms)
I'm trying to query Facebook for different information, for example - friends list. And it works fine, but of course it only gives limited number of results. How do I access the next batch of results?
import facebook
import json
ACCESS_TOKEN = ''
def pp(o):
with open('facebook.txt', 'a') as f:
json.dump(o, f, indent=4)
g = facebook.GraphAPI(ACCESS_TOKEN)
pp(g.get_connections('me', 'friends'))
The result JSON does give me paging-cursors-before and after values - but where do I put it?
I'm exploring Facebook Graph API through the facepy library for Python (works on Python 3 too), but I think I can help.
TL-DR:
You need to append &after=YOUR_AFTER_CODE to the URL you've called (e.g: https://graph.facebook/v2.8/YOUR_FB_ID/friends/?fields=id,name), giving you a link like this one: https://graph.facebook/v2.8/YOUR_FB_ID/friends/?fields=id,name&after=YOUR_AFTER_CODE, that you should make a GET Request.
You'll need requests in order to make a get request for Graph API using your user ID (I'm assuming you know how to find it programatically) and some url similar to the one I give you below (see URL variable).
import facebook
import json
import requests
ACCESS_TOKEN = ''
YOUR_FB_ID=''
URL="https://graph.facebook.com/v2.8/{}/friends?access_token={}&fields=id,name&limit=50&after=".format(YOUR_FB_ID, ACCESS_TOKEN)
def pp(o):
all_friends = []
if ('data' in o):
for friend in o:
if ('next' in friend['paging']):
resp = request.get(friend['paging']['next'])
all_friends.append(resp.json())
elif ('after' in friend['paging']['cursors']):
new_url = URL + friend['paging']['cursors']['after']
resp = request.get(new_url)
all_friends.append(resp.json())
else:
print("Something went wrong")
# Do whatever you want with all_friends...
with open('facebook.txt', 'a') as f:
json.dump(o, f, indent=4)
g = facebook.GraphAPI(ACCESS_TOKEN)
pp(g.get_connections('me', 'friends'))
Hope this helps!