How to get all data from a certain website with Scrapy? - python-3.x

I'm currently learning scraping with Scrapy, and I want to scrape data from this page: https://www.espn.com/nba/stats/player or https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2
If you go to the link you will see a "Show More" button at the bottom of the data. This confuses me, because if I scrape the page right now I only get 50 rows, which is not what I want. So I inspected the "Show More" button, but it is only an anchor with href="#".
[UPDATE] Using Scrapy + scrapy-playwright
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageMethod('wait_for_selector', '//a[@class="AnchorLink loadMore__link"]'),
                PageMethod('click', '//a[@class="AnchorLink loadMore__link"]'),
            ]
        ),
        callback=self.parse,
    )
async def parse(self, response):
    page = response.meta['playwright_page']
    button = response.meta['playwright_page_coroutines'][0]
    if button:
        await button.click()
    resp = response.body
    player_list = sel.xpath(
        "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath(
        "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    await page.wait_for_selector(player_list)
    sel = Selector(text=resp)
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
When only using Scrapy:
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        callback=self.parse,
    )
def parse(self, response):
    sel = Selector(text=response.body)
    player_list = sel.xpath("//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath("//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
Am I doing it wrong here? Also, if you click Show More, the page calls the API shown below. I could scrape from that API, but for now I want to get the data from the HTML with XPath itself.
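For comparison, here is a minimal sketch (not a confirmed fix) of one way to click "Show More" repeatedly inside parse() with scrapy-playwright. The selectors are assumptions based on the markup described above, and note that newer scrapy-playwright releases use the playwright_page_methods meta key (with PageMethod) instead of playwright_page_coroutines:
import scrapy
from scrapy.selector import Selector
from scrapy_playwright.page import PageMethod

class NbaStatsSpider(scrapy.Spider):
    name = "nba_stats"

    def start_requests(self):
        yield scrapy.Request(
            url="https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2",
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", "//a[contains(@class, 'loadMore__link')]"),
                ],
            },
            callback=self.parse,
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # Keep clicking "Show More" until the link is no longer in the DOM.
        while True:
            button = await page.query_selector("//a[contains(@class, 'loadMore__link')]")
            if button is None:
                break
            await button.click()
            await page.wait_for_timeout(1000)  # crude wait for the extra rows to render
        html = await page.content()
        await page.close()
        sel = Selector(text=html)
        rows = sel.xpath("//table[contains(@class, 'Table--fixed-left')]//tbody/tr")
        for row in rows:
            yield {"player_name": row.xpath(".//a/text()").get()}
The fixed one-second wait is the crudest option; waiting for the row count to grow after each click would be more robust.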

Related

AttributeError: Response content isn't text. What is the problem?

I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents. I am getting "AttributeError: Response content isn't text". I am running this code in a python3 venv. Would you help me by explaining and solving the problem?
Here is my code:
import scrapy
import json

class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
PROXY_POOL_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...
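The traceback is not included above, but this AttributeError is the one Scrapy raises when .text is accessed on a response it did not recognise as text (the spider itself only uses response.body, so the access may well be happening inside one of the installed middlewares, e.g. ban detection). As a rough, hedged sketch of a defensive check in the callback (it will not help if a middleware triggers the error), the JSON handling could be guarded like this:
from scrapy.http import TextResponse

def get_membership_no(self, response):
    # Only try to decode JSON when the downloader actually returned a text response.
    if not isinstance(response, TextResponse):
        self.logger.warning("Non-text response from %s, skipping", response.url)
        return
    payload = json.loads(response.text)
    data_array = payload['data']
    next_page = payload['links']['next']
    # ... continue as in the original spider ...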

youtube API get all playlist id from a channel : python

I'm trying to get a list of all the playlists from a specific channel with the YouTube API, in Python.
I'd like to have the playlist_id values in an array:
all_playlist_item = []
If I call
https://www.googleapis.com/youtube/v3/playlists?part=snippet&channelId=UC1udnO-W6gpR9qzleJ5SDKw&key=xxxxxx
here's the response from this request:
{
    "kind": "youtube#playlistListResponse",
    "etag": "PMwo-9BIp7p_L2ynH9sFOGIOido",
    "nextPageToken": "CAUQAA",
    "pageInfo": {
        "totalResults": 17,
        "resultsPerPage": 5
    },
    "items": [
        {
            "kind": "youtube#playlist",
            "etag": "-V67IpyB9a1JGDGJ4pVQnEoMRy4",
            "id": "PLWi7PxnyAMeN1tb-ldDzJcnJy2yd5wYrO",
            "snippet": {
                ...
I think I have to use nextPageToken, but I don't know how to write this function for playlist items.
Here's my code (it extracts to Excel all videos from a specific playlist):
channel_name = "UC1udnO-W6gpR9qzleJ5SDKw"
playlist_name = "PLWi7PxnyAMeOJmVLv8Z_N3edNyipsnHbo"
api_key = "xxxxxx"

from apiclient.discovery import build
import pandas as pd

youtube = build('youtube', 'v3', developerKey=api_key)

def get_channel_videos(channel_id):
    # get Uploads playlist id
    res = youtube.channels().list(id=channel_id,
                                  part='contentDetails').execute()
    playlist_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    videos = []
    next_page_token = None
    while 1:
        res = youtube.playlistItems().list(playlistId=playlist_name,
                                           part='snippet',
                                           maxResults=50,
                                           pageToken=next_page_token).execute()
        videos += res['items']
        next_page_token = res.get('nextPageToken')
        if next_page_token is None:
            break
    return videos

videos = get_channel_videos(channel_name)

def get_videos_stats(video_ids):
    stats = []
    for i in range(0, len(video_ids), 50):
        res = youtube.videos().list(id=','.join(video_ids[i:i+50]),
                                    part='statistics').execute()
        stats += res['items']
    return stats

video_ids = list(map(lambda x: x['snippet']['resourceId']['videoId'], videos))
stats = get_videos_stats(video_ids)

d = []
if len(stats) != len(videos):
    i = 1
    j = 0
else:
    i = 0
    j = 0
len_video = len(videos)
len_stats = len(stats)
for video in videos:
    if i >= len_video:
        break
    Url_video = 'https://www.youtube.com/watch?v=' + videos[i]['snippet']['resourceId']['videoId'] + '&list=' + playlist_name
    d.append((videos[i]['snippet']['title'],
              videos[i]['snippet']['resourceId']['videoId'],
              Url_video,
              stats[j]['statistics']['viewCount'],
              stats[j]['statistics']['likeCount'],
              stats[j]['statistics']['dislikeCount']
              ))
    i += 1
    j += 1

df = pd.DataFrame(d, columns=('Titre_video', 'ID_video', 'Url_video', 'vues', 'like', 'dislike'))
df['vues'] = df['vues'].astype(int)
df['like'] = df['like'].astype(int)
df['dislike'] = df['dislike'].astype(int)
df.index += 1
df.to_excel("youtube-playlist.xlsx")
From the pagination docs, you can use:
youtube.playlistItems().list_next(request, response) for iterating through playlist item responses
youtube.playlists().list_next(request, response) for iterating through a channel's playlist responses
Get all videos from Playlist
import googleapiclient.discovery

playlist_id = "PLWi7PxnyAMeOJmVLv8Z_N3edNyipsnHbo"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey="YOUR_API_KEY")

request = youtube.playlistItems().list(
    part="snippet",
    playlistId=playlist_id,
    maxResults=50
)
response = request.execute()

playlist_items = []
while request is not None:
    response = request.execute()
    playlist_items += response["items"]
    request = youtube.playlistItems().list_next(request, response)

print(f"total: {len(playlist_items)}")
print(playlist_items)
Get all playlists from channel
import googleapiclient.discovery

channel_id = "UC1udnO-W6gpR9qzleJ5SDKw"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey="YOUR_API_KEY")

request = youtube.playlists().list(
    part="snippet",
    channelId=channel_id,
    maxResults=50
)
response = request.execute()

playlists = []
while request is not None:
    response = request.execute()
    playlists += response["items"]
    request = youtube.playlists().list_next(request, response)

print(f"total: {len(playlists)}")
print(playlists)
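If you only want the playlist IDs themselves (the all_playlist_item array from the question), they can be collected from the items gathered above, for example:
# Each item of the playlists.list response carries its playlist ID in the "id" field.
all_playlist_item = [playlist["id"] for playlist in playlists]
print(all_playlist_item)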

How to scrape headings in about page?

I'm trying to scrape the headings on the about page, but I have tried a lot and failed because I don't properly understand what to do. I'm a beginner, so I need help.
import scrapy
from ..items import DmoztutorialItem

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = [
        'http://dmoz-odp.org/',
    ]

    def parse(self, response):
        items = DmoztutorialItem()
        Navbar = response.css('#main-nav a::text').extract()
        Category_names = response.css('.top-cat a::text').extract()
        Subcategories = response.css('.sub-cat a::text').extract()
        items['Navbar'] = Navbar
        items['Category_names'] = Category_names
        items['Subcategories'] = Subcategories
        yield items
        # Nav_page = response.css('#main-nav a::attr(href)').extract()
        Nav_page = 'http://dmoz-odp.org/docs/en/about.html'.extract()
        # About_heading = response.css('h1+ p , #mainContent
        # h1::text').extract()
        items['Nav_page'] = Nav_page
        # items['About_heading'] = About_heading
        yield response.follow(Nav_page)
Can you tell us what kind of output you need? It is very unclear from your post.
Check this example, where you can:
Get some data;
Call a request to another page with the data saved;
Yield the final data.
Hope it will help you.
import scrapy

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = ['http://dmoz-odp.org/']
    nav_page = 'http://dmoz-odp.org/docs/en/about.html'

    def parse(self, response):
        # collect data on the first page
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
            'Nav_page': self.nav_page,
        }
        # save it and call a request to another page
        yield response.follow(self.nav_page, self.parse_nav, meta={'items': items})

    def parse_nav(self, response):
        # do your stuff on the second page
        items = response.meta['items']
        items['something'] = 'something'  # add your logic
        yield items
Or use separate logic for separate pages:
import scrapy

class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'

    def start_requests(self):
        reqs = (
            ('http://dmoz-odp.org/', self.parse_main),
            ('http://dmoz-odp.org/docs/en/about.html', self.parse_nav),
        )
        for link, callback in reqs:
            yield scrapy.Request(link, callback)

    def parse_main(self, response):
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
        }
        yield items

    def parse_nav(self, response):
        items = {
            'something': 'something',  # add your logic
        }
        yield items
To parse a different HTML page, you need to yield a Request object with the target URL as the first argument for its constructor, and do the parsing in the method of your spider that you pass to the constructor of that Request object as the callback parameter.
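As a rough illustration of that inside a spider (the about-page URL and heading selector come from the question; parse_about is just a placeholder name):
def parse(self, response):
    # ... collect whatever you need from the current page ...
    # then hand the about page to its own callback
    yield scrapy.Request('http://dmoz-odp.org/docs/en/about.html',
                         callback=self.parse_about)

def parse_about(self, response):
    # heading selector taken from the commented-out line in the question
    yield {'About_heading': response.css('#mainContent h1::text').get()}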
I strongly encourage you to complete the Scrapy tutorial. What you are trying to achieve is covered in the Following links section.

How to get a Scrapy request to go to the next page?

I need to parse the country code of each comment on my web page and then store it in a JSON file, but I am having an issue when I try to go to the next page.
I'm not sure whether I'm sending the request the correct way.
Here's my code:
index = 1

def parse_fb(self, response):
    data = response.body
    soup = BeautifulSoup(data, "html.parser")
    with open(ArticlesSpider.pro_id + '.json', 'a+') as f:
        user_country = soup.find_all('div', class_='user-country')
        for i in range(len(user_country)):
            code = str(user_country[i])
            code = code.split('">')
            code = str(code[2])
            code = code.split('</b>')
            code = code[0]
            json.dump(code, f)
            print(code)
    request_url = 'https://feedback.aliexpress.com/display/productEvaluation.htm'
    data = {
        'ownerMemberId': '',
        'memberType': 'seller',
        'productId': str(ArticlesSpider.pro_id),
        'companyId': '',
        'evaStarFilterValue': 'all Stars',
        'evaSortValue': 'sortdefault#feedback',
        'page': str(index),
        'currentPage': '',
        'startValidDate': '',
        'i18n': 'false',
        'withPictures': 'false',
        'withPersonalInfo': 'false',
        'withAdditionalFeedback': 'false',
        'onlyFromMyCountry': 'false',
        'version': 'evaNlpV1_2',
        'isOpened': 'true',
        'translate': 'Y',
        'jumpToTop': 'false',
        '${csrfToken.parameterName}': '${csrfToken.token}',
    }
    index += 1
    yield scrapy.FormRequest(request_url, formdata=data, callback=self.parse_fb)
Why do you need BeautifulSoup? All this is superfluous.
Here is the working code for your product:
import scrapy

class CodeInfo(scrapy.Item):
    code = scrapy.Field()

class feedback_aliexpress_com(scrapy.Spider):
    name = 'feedback_aliexpress_com'
    domain = 'feedback.aliexpress.com'
    allowed_domains = ['feedback.aliexpress.com']
    start_urls = ['https://feedback.aliexpress.com/display/productEvaluation.htm?' +
                  'productId=32911361727&ownerMemberId=206054366&companyId=&memberType=seller&startValidDate=']
    url = 'https://feedback.aliexpress.com/display/productEvaluation.htm'
    page = 1

    def parse(self, response):
        code = CodeInfo()
        if response.css('.user-country'):
            for listing in response.css('.feedback-item'):
                code['code'] = listing.css('.user-country > b::text').extract_first()
                yield code
            self.page += 1
            self.url = 'https://feedback.aliexpress.com/display/productEvaluation.htm?productId=32911361727&ownerMemberId=206054366&page=' \
                       + str(self.page)
            yield response.follow(url=self.url, callback=self.parse)
There is a lot of excess in there, I know; I did it in a hurry, but check it out.
Well, you are changing index, but not really using it: your request_url stays the same throughout the process. If this is the bit that you expect to change the page:
yield scrapy.FormRequest(request_url, formdata=data, callback=self.parse_fb)
then you have to change the request before calling it.
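A minimal sketch of that idea, passing the page number along with each request via cb_kwargs instead of a shared index (the URL and productId are borrowed from the answer above; the actual extraction is left as a comment):
import scrapy

class FeedbackPagesSpider(scrapy.Spider):
    name = 'feedback_pages'
    start_urls = ['https://feedback.aliexpress.com/display/productEvaluation.htm?productId=32911361727']

    def parse(self, response, page=1):
        # ... extract the country codes from the current page here ...
        # Stop requesting once a page comes back without comments (check omitted here).
        yield scrapy.FormRequest(
            'https://feedback.aliexpress.com/display/productEvaluation.htm',
            formdata={'productId': '32911361727', 'page': str(page + 1)},
            callback=self.parse,
            cb_kwargs={'page': page + 1},
        )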

File Upload To Confluence With Groovy?

I want to upload a file to Confluence with a Groovy script, as in this Python script example!
I started to translate the code into Groovy:
// Groovy
def server = new XMLRPCServerProxy("http://confluence:8090/rpc/xmlrpc")
def spaceKey = "Area"
def pageTitel = "FileUpload"
def contentType = "application/pdf"
def token = server.confluence2.login("UserName","Password")
def page = server.confluence2.getPage(token, spaceKey, pageTitel)
def fileName = "D:\\datamodel.pdf"
def file = new File (fileName)
//
//Up to this point it works!!!
but I found nothing in Groovy for the last steps!
# Python script from the example link above
# .....
attachment = {}
attachment['fileName'] = os.path.basename(filename)
attachment['contentType'] = contentType
server.confluence1.addAttachment(token, page['id'], attachment, xmlrpclib.Binary(data))
I think I need an object for the attachment and a method to store the attachment on the given page on the server.
FINAL WORKING CODE
def server = new XMLRPCServerProxy("http://confluence:8090/rpc/xmlrpc")
def spaceKey = "Area"
def pageTitel = "FileUpload"
def fileName = "D:\\datamodel.pdf"
def contentType = "application/pdf"
def token = server.confluence2.login("UserName" , "Password")
def page = server.confluence2.getPage(token, spaceKey, pageTitel)
def file = new File (fileName)
server.confluence2.addAttachment( token, page.id, [ fileName: file.name, contentType:contentType ], file.bytes )
Looking at the docs, it looks like you should be able to do:
server.confluence2.addAttachment( token,
                                  page.id,
                                  [ fileName: file.name,
                                    contentType: 'application/pdf' ],
                                  file.bytes )
