I want to retrieve as many results as possible for a particular YouTube search query. However, the maximum number of results that can be returned per request is 50. I know that nextPageToken can be used to retrieve the next page of results. How do I modify the Python code below to do that?
#!/usr/bin/python
# original source example: https://developers.google.com/youtube/v3/docs/search/list
# assumes use of Python 3
# This sample executes a search request for the specified search term.
# Sample usage:
# python search.py --q=surfing --max-results=10
# NOTE: To use the sample, you must provide a developer key obtained
# in the Google APIs Console. Search for "REPLACE_ME" in this code
# to find the correct place to provide that key.
import argparse
# library googleapiclient installed with: pip install --upgrade google-api-python-client
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
# Set DEVELOPER_KEY to the API key value from the APIs & auth > Registered apps
# tab of
# https://cloud.google.com/console
# Please ensure that you have enabled the YouTube Data API for your project.
DEVELOPER_KEY = 'KEY'
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'
def youtube_search(query_term, max_results):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    # Call the search.list method to retrieve results matching the specified
    # query term.
    search_response = youtube.search().list(
        q=query_term,
        part='id,snippet',
        type='video',
        relevanceLanguage='en',
        maxResults=max_results
    ).execute()

    video_ids = []

    # Collect the videoId of every matching video.
    for search_result in search_response.get('items', []):
        video_ids.append(search_result['id']['videoId'])

    return video_ids
if __name__ == '__main__':
    url_prefix = 'https://www.youtube.com/watch?v='
    query_terms = '"my_query"'
    max_results = 50

    try:
        ids = youtube_search(query_terms, max_results)
    except HttpError as e:
        print('An HTTP error %d occurred:\n%s' % (e.resp.status, e.content))
    else:
        with open('output.txt', 'w') as f:
            for i in ids:
                f.write(url_prefix + i + "\n")
Here's what needs to be added to keep fetching results until no nextPageToken is returned. It goes inside youtube_search, right after the initial search_response is retrieved and before the video_ids loop:
nextPageToken = search_response.get('nextPageToken')

while 'nextPageToken' in search_response:
    nextPage = youtube.search().list(
        q=query_term,
        part='id,snippet',
        type='video',
        relevanceLanguage='en',
        maxResults=max_results,
        pageToken=nextPageToken
    ).execute()
    search_response['items'] = search_response['items'] + nextPage['items']

    if 'nextPageToken' not in nextPage:
        search_response.pop('nextPageToken', None)
    else:
        nextPageToken = nextPage['nextPageToken']
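For reference, here is a rough sketch of how the whole thing could be folded into a single paginated helper (the same approach as above, just collecting the video IDs page by page; the name youtube_search_all_pages is mine, and in practice the API tends to stop handing out page tokens after a few hundred results per query, so don't expect every match):

def youtube_search_all_pages(query_term, max_results):
    """Collect video IDs across result pages until no nextPageToken is returned."""
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    video_ids = []
    next_page_token = None

    while True:
        params = dict(
            q=query_term,
            part='id,snippet',
            type='video',
            relevanceLanguage='en',
            maxResults=max_results,
        )
        if next_page_token:
            params['pageToken'] = next_page_token

        search_response = youtube.search().list(**params).execute()
        for search_result in search_response.get('items', []):
            video_ids.append(search_result['id']['videoId'])

        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            break

    return video_ids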
I am working on a project where I need to create a movie database.
I have created my database and imported the IMDB links that redirect to each movie's page. I would also like to add the main image/thumbnail of each movie, so that I can then use the CSV in Power BI.
However, I did not manage to do it. I have tried this:
import requests
from bs4 import BeautifulSoup
import numpy as np
images = []
for i in df_database_url['Url Film']:
    r = requests.get(i)
    soup = BeautifulSoup(r.content, "html.parser")
    images.append(image_url)
But my goal is to have a column that includes the thumbnail for each movie.
Assuming that i is an IMDB movie URL (the kind that starts with https://www.imdb.com/title), you can target the script tag that contains most of the movie's main information. You can get the image URL with:
# import json
image_url = json.loads(soup.select_one('script[type="application/ld+json"]').text)['image']
or, if we're more cautious:
# import json
scCont = [s.text for s in soup.select('script[type="application/ld+json"]') if '"image"' in s.text]
if scCont:
    try:
        scCont = json.loads(scCont[0])
        if 'image' not in scCont:
            image_url = None
            print('No image found for', i)
        else:
            image_url = scCont['image']
    except Exception as e:
        image_url = None
        print('Could not parse movie info for', i, '\n', str(e))
else:
    image_url = None
    print('Could not find script with movie info for', i)
(and you can get the trailer thumbnail with scCont['trailer']['thumbnailUrl'])
This way, instead of raising an error whenever anything on the path to the expected info is unavailable, it simply sets image_url to None; if you want it to halt and raise an error in such cases, use the first version.
and then after the loop you can add in the column with something like
df_database_url['image_urls'] = images
(you probably know that...)
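Putting the pieces together, the whole loop might look roughly like this (a sketch, assuming df_database_url and its 'Url Film' column exist exactly as in the question):

import json

import requests
from bs4 import BeautifulSoup

images = []
for i in df_database_url['Url Film']:
    r = requests.get(i)
    soup = BeautifulSoup(r.content, "html.parser")

    image_url = None
    # look for the JSON-LD script block that carries the movie metadata
    scCont = [s.text for s in soup.select('script[type="application/ld+json"]')
              if '"image"' in s.text]
    if scCont:
        try:
            image_url = json.loads(scCont[0]).get('image')
        except Exception as e:
            print('Could not parse movie info for', i, '\n', str(e))
    images.append(image_url)

df_database_url['image_urls'] = images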
This whole script worked fine the first 2-3 times, but now it is constantly getting 503 responses. I checked my Internet connection multiple times and there was no problem with it. Here is my code:
from bs4 import BeautifulSoup
import requests, sys, os, json


def get_amazon_search_page(search):
    # spaces are replaced with "+" so the term can be dropped straight into the search URL
    search = search.strip().replace(" ", "+")
    for i in range(3):  # tries to connect to and request the Amazon page up to 3 times
        try:
            print("Searching...")
            response = requests.get("https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
            print(response.status_code)
            if response.status_code == 200:
                return response.content
        except Exception:
            pass
    print("Is the search valid for the site: https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
    sys.exit(1)


def get_items_from_page(page_content):
    print(page_content)
    soup = BeautifulSoup(page_content, "html.parser")  # soup for extracting information
    items = soup.find_all("span", class_="a-size-medium a-color-base a-text-normal")
    prices = soup.find_all("span", class_="a-price-whole")
    item_list = []
    total_price_of_all = 0
    for item, price in zip(items, prices):
        entry = {}
        entry["Name"] = item.text
        entry["Price"] = int(price.text.replace(",", ""))
        total_price_of_all += entry["Price"]
        item_list.append(entry)
    average_price = total_price_of_all / len(item_list)
    file = open("items.json", "w")
    json.dump(item_list, file, indent=4)
    print("Your search results are available in the items.json file")
    print("Average price for the search: {}".format(average_price))
    file.close()


def main():
    os.system("clear")
    print("Note: Sometimes the Amazon site misbehaves by sending 503 responses; this can be due to heavy traffic on the site, please cooperate\n\n")
    search = input("Enter product name: ").strip()
    page_content = get_amazon_search_page(search)
    get_items_from_page(page_content)


if __name__ == "__main__":
    while True:
        main()
Please help!
The server blocks you from scraping it.
If you check the robots.txt, you can see that the link you are trying to request is disallowed:
Disallow: */s?k=*&rh=n*p_*p_*p_
However, a simple way to bypass this blocking is to change your User-Agent header. By default, requests sends something like "python-requests/2.22.0"; changing it to something more browser-like will usually work, at least temporarily.
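For example, in get_amazon_search_page you could pass a headers dict to requests.get; the User-Agent string below is just an illustrative browser string, not anything Amazon specifically requires:

# Example browser-like User-Agent; any reasonably current one should do.
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/96.0.4664.110 Safari/537.36")
}

response = requests.get(
    "https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search),
    headers=headers,
)

If the 503s come back later, adding a short delay between requests usually helps as well.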
I am trying to "block" X country from the search list, but I can't use an operator such as -"X" also I tried some other operators and techniques but none of that worked. I get this Error :
TypeError: bad operand type for unary -: 'str'
I don't know how can I tell YouTube to say I don't want this X, Y, and Z countries on my search list.
here's my code:
import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]


def main():
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "YOUR_CLIENT_SECRET_FILE.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    request = youtube.search().list(
        part="snippet",
        eventType="completed",
        order="videoCount",
        regionCode="X",  # X is a country code as an example
        safeSearch="moderate",
        topicId="/m/01k8wb",
        type="video",
        videoDuration="short",
        videoLicense="youtube"
    )
    response = request.execute()

    print(response)


if __name__ == "__main__":
    main()
It is not possible; here is the corresponding report on Google's issue tracker:
https://issuetracker.google.com/150359410
I am trying to run the script below to extract the tags from a webpage and save them into a CSV file. In detail, I want to extract the tags associated with a class name. However, I come across this error:
AttributeError: 'WebElement' object has no attribute 'extract_first'
The script is the following:
import csv
from selenium import webdriver
from time import sleep
from parsel import Selector
from selenium.webdriver.common.keys import Keys
from collections import defaultdict
from selenium.webdriver.support.select import Select

####### reading from the input file ##########
columns = defaultdict(list)  # each value in each column is appended to a list

# get the list of keywords from the csv file
with open('query.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile)  # read rows into a dictionary format
    for row in reader:  # read a row as {column1: value1, column2: value2,...}
        for (k, v) in row.items():  # go over each column name and value
            columns[k].append(v)  # append the value into the appropriate list

# the list containing all of the keywords
search_query_list = columns['Keyword']

########## start scraping ###############
rb_results = []

# create a driver and let it open google chrome
driver = webdriver.Chrome("chromedriver")
# get website
driver.get('https://www.redbubble.com/')
sleep(0.5)

for i in range(len(search_query_list)):
    next_query = search_query_list[i]
    # get RB website
    driver.get('https://www.redbubble.com/')
    # get the search bar by its name
    search_bar = driver.find_element_by_name("query")
    sleep(0.5)
    # enter the query into the search bar
    search_bar.send_keys(next_query)
    # press enter
    search_bar.send_keys(Keys.RETURN)
    sleep(1)
    # from parsel's selector get the page source
    sel1 = Selector(text=driver.page_source)
    sleep(0.5)
    # get first shirt
    continue_link = driver.find_element_by_class_name('shared-components-ShopSearchSkeleton-ShopSearchSkeleton__composedComponentWrapper--1s_CI').click()
    sleep(1)
    sel2 = Selector(text=driver.page_source)
    sleep(0.5)

    ################## get TAGS ###############
    # Check tags for all products
    try:
        # get the tags for the search query
        tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").extract_first()
        tags_rb = str(tags_rb)
        # if tags were found, store them; otherwise store "0"
        if tags_rb == None:
            rb_results.append("0")
        else:
            #rb_results = str(tags_rb)
            rb_results.append(tags_rb)
    except ValueError:
        pass
    #except:
        #rb_results.append("errore")

###### writing part ########
with open("rb_results.csv", "w", newline='') as resultFile:
    writer = csv.DictWriter(resultFile, fieldnames=["Rb Results"], delimiter='\t')
    writer.writeheader()
    writer.writerows({'Rb Results': item} for item in rb_results)
Any ideas about how to fix it and extract the text of shared-components-Tags-Tags__listContent--oLdDf? Many thanks!
If I understand correctly, you want the element's text. You can do it like this:
replace:
tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").extract_first()
with:
tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").text
You are getting the error
'WebElement' object has no attribute 'extract_first'
because a Selenium WebElement has no .extract_first() method; that method belongs to parsel/Scrapy selectors, not to Selenium.
PS: you don't need this:
tags_rb = str(tags_rb)
The code block to replace is:
# Check tags for all products
try:
# get the tags for the search query
tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").text # get text
# tags_rb = str(tags_rb) no need in this line
# if number of products is found print it and search for the prime
# print the number of products found
if tags_rb == None:
rb_results.append("0")
else:
#rb_results = str(tags_rb)
rb_results.append(tags_rb)
except ValueError:
pass
I am trying to run the Twitter Streaming API to collect only tweets from a certain country.
Trying to write long/lat coordinates as a filter gives the syntax error:
positional argument follows keyword argument
I'm willing to use the geotag to filter by country or city, but have no idea what needs to be written into the code to do that.
A solution to stream only tweets from an approximate location would be great.
# Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Variables that contain the user credentials to access Twitter API
access_token = "put your keys here"
access_token_secret = "put your keys here"
consumer_key = "put your keys here"
consumer_secret = "put your keys here"


# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        print(data)
        return True

    def on_error(self, status):
        print(status)


if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the
    # Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)
You need to supply the bounding box for your location as longitude/latitude pairs (south-west corner first, then north-east). Something like this:
stream.filter(locations=[-74,40,-73,41])
Here is Twitter's documentation.
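To plug that into the script above, the call goes at the end of the __main__ block, right after the stream is created; the coordinates here are just an example box (roughly New York City), so substitute the corners of your own country or city:

    # Bounding box as [sw_longitude, sw_latitude, ne_longitude, ne_latitude];
    # this example roughly covers New York City. Replace with your own area.
    stream.filter(locations=[-74, 40, -73, 41])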