How to get a Scrapy request to go to the next page? - python-3.x

I need to parse the country code of each comment on my web page and then store it in a JSON file, but I run into an issue when I try to move to the next page.
I'm not sure whether I'm sending the request the right way.
Here's my code:
index = 1

def parse_fb(self, response):
    data = response.body
    soup = BeautifulSoup(data, "html.parser")
    with open(ArticlesSpider.pro_id + '.json', 'a+') as f:
        user_country = soup.find_all('div', class_='user-country')
        for i in range(len(user_country)):
            code = str(user_country[i])
            code = code.split('">')
            code = str(code[2])
            code = code.split('</b>')
            code = code[0]
            json.dump(code, f)
            print(code)
    request_url = 'https://feedback.aliexpress.com/display/productEvaluation.htm'
    data = {
        'ownerMemberId': '',
        'memberType': 'seller',
        'productId': str(ArticlesSpider.pro_id),
        'companyId': '',
        'evaStarFilterValue': 'all Stars',
        'evaSortValue': 'sortdefault#feedback',
        'page': str(index),
        'currentPage': '',
        'startValidDate': '',
        'i18n': 'false',
        'withPictures': 'false',
        'withPersonalInfo': 'false',
        'withAdditionalFeedback': 'false',
        'onlyFromMyCountry': 'false',
        'version': 'evaNlpV1_2',
        'isOpened': 'true',
        'translate': 'Y',
        'jumpToTop': 'false',
        '${csrfToken.parameterName}': '${csrfToken.token}',
    }
    index += 1
    yield scrapy.FormRequest(request_url, formdata=data, callback=self.parse_fb)

Why do you need BeautifulSoup? It is superfluous here; Scrapy's own selectors can do all of this.
Here is working code for your product:
import scrapy


class CodeInfo(scrapy.Item):
    code = scrapy.Field()


class feedback_aliexpress_com(scrapy.Spider):
    name = 'feedback_aliexpress_com'
    domain = 'feedback.aliexpress.com'
    allowed_domains = ['feedback.aliexpress.com']
    start_urls = ['https://feedback.aliexpress.com/display/productEvaluation.htm?' +
                  'productId=32911361727&ownerMemberId=206054366&companyId=&memberType=seller&startValidDate=']
    url = 'https://feedback.aliexpress.com/display/productEvaluation.htm'
    page = 1

    def parse(self, response):
        code = CodeInfo()
        if response.css('.user-country'):
            for listing in response.css('.feedback-item'):
                code['code'] = listing.css('.user-country > b::text').extract_first()
                yield code
            self.page += 1
            self.url = 'https://feedback.aliexpress.com/display/productEvaluation.htm?productId=32911361727&ownerMemberId=206054366&page=' \
                       + str(self.page)
            yield response.follow(url=self.url, callback=self.parse)
There is a lot of excess in it, I know; I did it in a hurry, but check it out.
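A follow-up note: since the original goal was to end up with a JSON file, there is no need to open a file and call json.dump by hand; Scrapy's feed export can write the yielded items out itself. A minimal sketch, assuming Scrapy 2.1+ (where the FEEDS setting exists) and the spider from the answer above; the codes.json filename is just an example:

class feedback_aliexpress_com(scrapy.Spider):
    name = 'feedback_aliexpress_com'
    # Hypothetical: have Scrapy write every yielded item into codes.json itself.
    custom_settings = {
        'FEEDS': {'codes.json': {'format': 'json'}},
    }
    # ... rest of the spider exactly as in the answer above ...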

Well, you are changing index, but not really using it: your request_url stays the same for the whole run. If this bit is the one that you expect to change the page,
yield scrapy.FormRequest(request_url, formdata=data, callback=self.parse_fb)
then you have to change request_url (or whatever carries the page number) before yielding that.
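One way to make the page number actually travel with each request is sketched below (my own illustration, not the asker's original code, assuming Scrapy 1.7+ where cb_kwargs exists): keep the current page in cb_kwargs so each callback knows which page it parsed and can request the next one. The product id is taken from the answer above:

import scrapy


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    feedback_url = 'https://feedback.aliexpress.com/display/productEvaluation.htm'
    pro_id = '32911361727'  # product id borrowed from the answer above

    def start_requests(self):
        yield from self.request_page(1)

    def request_page(self, page):
        # The page number lives in the form data of each request.
        formdata = {
            'productId': self.pro_id,
            'memberType': 'seller',
            'page': str(page),
        }
        yield scrapy.FormRequest(self.feedback_url, formdata=formdata,
                                 callback=self.parse_fb, cb_kwargs={'page': page})

    def parse_fb(self, response, page):
        for country in response.css('.user-country > b::text').getall():
            yield {'code': country}
        # Assumption: a page with no user-country entries means we are done.
        if response.css('.user-country'):
            yield from self.request_page(page + 1)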

Related

How to get all data certain website with Scrapy?

I'm currently learning scraping with Scrapy, and I want to scrape data from this: https://www.espn.com/nba/stats/player or https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2
If you go to the link you will see a "Show More" button at the bottom of the data. This confuses me, because if I scrape the page as it is I only get 50 rows, which is not what I want. I looked at the "Show More" button, but it is only an <a href="#">.
[UPDATE] Using Scrapy + Playwright
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageMethod('wait_for_selector', '//a[@class="AnchorLink loadMore__link"]'),
                PageMethod('click', '//a[@class="AnchorLink loadMore__link"]'),
            ]
        ),
        callback=self.parse,
    )

async def parse(self, response):
    page = response.meta['playwright_page']
    button = response.meta['playwright_page_coroutines'][0]
    if button:
        await button.click()
    resp = response.body
    player_list = sel.xpath(
        "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath(
        "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    await page.wait_for_selector(player_list)
    sel = Selector(text=resp)
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
When only using scrapy
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        callback=self.parse,
    )

def parse(self, response):
    sel = Selector(text=response.body)
    player_list = sel.xpath("//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath("//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
Am I doing it wrong here? Also, if you click "Show More" the browser reveals the underlying API call; I could scrape from that API, but for now I want to get the data from the HTML with XPath itself.
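Not a verified answer, but for reference, here is a minimal sketch of how the Playwright side is usually wired in scrapy-playwright (assuming the scrapy-playwright download handler is already enabled in settings, and a recent release where the meta key is playwright_page_methods): the click is requested through the page-methods list attached to the request, and the callback then parses the already-rendered response directly, reusing the same XPath as the plain-Scrapy version. Only two of the columns are shown; the rest would follow the question's code:

import scrapy
from scrapy_playwright.page import PageMethod


class NbaStatsSpider(scrapy.Spider):
    name = 'nba_stats'

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
            meta={
                'playwright': True,
                'playwright_page_methods': [
                    # Click "Show More" once, then give the new rows time to render.
                    PageMethod('click', 'a.AnchorLink.loadMore__link'),
                    PageMethod('wait_for_timeout', 2000),
                ],
            },
            callback=self.parse,
        )

    def parse(self, response):
        # response already holds the HTML after Playwright ran the page methods,
        # so the same XPath as the plain-Scrapy version can be reused here.
        player_list = response.xpath(
            "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
        stats_list = response.xpath(
            "//div[@class='Table__ScrollerWrapper relative overflow-hidden']"
            "/div[@class='Table__Scroller']/table/tbody/tr")
        for player, stat in zip(player_list, stats_list):
            yield {
                'player_name': player.xpath('.//a/text()').get(),
                'points_per_game': stat.xpath('.//td[4]/text()').get(),
            }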

How to call a loop under the main function in Python

I am working on a script which I have to modify in order to loop through multiple resources within a function.
Below are the items we need to loop through to get the data from; this comes from Config_local:
BASE_URL = "https://synergy.hpe.example.com/rest/"
RES_EXT = ['resource-alerts?count=500&start=1',
           'resource-alerts?count=500&start=501',
           'resource-alerts?count=500&start=1001',
           'resource-alerts?count=500&start=1501'
           ]
While I am looping through the above list in the def main(): section and calling get_resource_alerts_response() inside the loop, the data coming out of the loop is getting overwritten, so only the last iteration's data is returned.
Main Script:
import os
import shutil
import smtplib
from email.message import EmailMessage

import pandas as pd
pd.set_option('expand_frame_repr', True)
import requests

from Config_local import (
    BASE_URL,
    DST_DIR,
    OUTFILE,
    PASSWORD,
    SRC_DIR,
    TIMESTAMP_DST,
    USERNAME,
    SUBJECT,
    FROM,
    TO,
    EMAIL_TEMPLATE,
    SMTP_SERVER,
    RES_EXT,
)


class FileMoveFailure(Exception):
    pass


class SynergyRequestFailure(Exception):
    pass


class SessionIdRetrievalFailure(SynergyRequestFailure):
    pass


class ResourceAlertsRetrievalFailure(SynergyRequestFailure):
    pass


def move_csv_files():
    for csv_file in os.listdir(SRC_DIR):
        if csv_file.endswith(".csv") and os.path.isfile(os.path.join(SRC_DIR, csv_file)):
            try:
                shutil.move(
                    os.path.join(f"{SRC_DIR}/{csv_file}"),
                    f"{DST_DIR}/{csv_file}-{TIMESTAMP_DST}.log"
                )
            except OSError as os_error:
                raise FileMoveFailure(
                    f'Moving file {csv_file} has failed: {os_error}'
                )


def get_session_id(session):
    try:
        response = session.post(
            url=f"{BASE_URL}/login-sessions",
            headers={
                "accept": "application/json",
                "content-type": "application/json",
                "x-api-version": "120",
            },
            json={
                "userName": USERNAME,
                "password": PASSWORD
            },
            verify=False
        )
    except requests.exceptions.RequestException as req_exception:
        # you should also get this logged somewhere, or at least
        # printed depending on your use case
        raise SessionIdRetrievalFailure(
            f"Could not get session id: {req_exception}"
        )
    json_response = response.json()
    if not json_response.get("sessionID"):
        # always assume the worst and do sanity checks & validations
        # on fetched data
        raise KeyError("Could not fetch session id")
    return json_response["sessionID"]


#def get_all_text(session, session_id):
#    all_text = ''
#    for res in RES_EXT:
#        url = f"{BASE_URL}{res}"
#        newresult = get_resource_alerts_response(session, session_id, url)
#        all_text += newresult
#        print(f"{all_text}")
#    return str(all_text)
#


def get_resource_alerts_response(session, session_id, res):
    try:
        return session.get(
            url=f"{BASE_URL}{res}",
            headers={
                "accept": "application/json",
                "content-type": "text/csv",
                "x-api-version": "2",
                "auth": session_id,
            },
            verify=False,
            stream=True
        )
    except requests.exceptions.RequestException as req_exception:
        # you should also get this logged somewhere, or at least
        # printed depending on your use case
        raise ResourceAlertsRetrievalFailure(
            f"Could not fetch resource alerts: {req_exception}"
        )


def resource_alerts_to_df(resource_alerts_response):
    with open(OUTFILE, 'wb') as f:
        for chunk in resource_alerts_response.iter_content(chunk_size=1024 * 36):
            f.write(chunk)
    return pd.read_csv(OUTFILE)


def send_email(df):
    server = smtplib.SMTP(SMTP_SERVER)
    msg = EmailMessage()
    msg['Subject'], msg['From'], msg['To'] = SUBJECT, FROM, TO
    msg.set_content("Text version of your html template")
    msg.add_alternative(
        EMAIL_TEMPLATE.format(df.to_html(index=False)),
        subtype='html'
    )
    server.send_message(msg)


def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
    print(resource_alerts_response)
    df = resource_alerts_to_df(resource_alerts_response)
    print(df)
    send_email(df)


if __name__ == '__main__':
    main()
Any help or hint will be much appreciated.
This looks like a copy of the code I recall we worked on here on SO, though not quite what you want now. However, since the code body as a whole is okay and the idea of the for loop also looks good, you just need to tweak it to meet the requirement.
1- Create an empty DataFrame: Synergy_Data = pd.DataFrame()
2- Then append the data you receive inside the for loop, i.e. resource_alerts_response, which becomes df = resource_alerts_to_df(resource_alerts_response).
3- Lastly, append this df to the empty Synergy_Data and then use it under your if __name__ == '__main__' block to send the e-mail. Also don't forget to declare Synergy_Data as a global variable inside main().
Synergy_Data = pd.DataFrame()

def main():
    global Synergy_Data
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        df = resource_alerts_to_df(resource_alerts_response)
        Synergy_Data = Synergy_Data.append(df)

if __name__ == '__main__':
    main()
    send_email(Synergy_Data)
Hope this will be helpful.
Only the last response is being used because the value of resource_alerts_response is overwritten on each iteration of the loop. You may consider acting on the data on each iteration, or storing it for use later, i.e. after the loop. I've included these options as modifications to the main() function below.
Option 1
Send an email for each resource alert response
def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        print(resource_alerts_response)
        # Indent the lines below so that these operations are executed in each loop iteration
        df = resource_alerts_to_df(resource_alerts_response)
        print(df)
        send_email(df)
Option 2
Merge all resource alert responses and send one email
def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    df_resource_alerts_responses = None
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        print(resource_alerts_response)
        df = resource_alerts_to_df(resource_alerts_response)
        if df_resource_alerts_responses is None:
            df_resource_alerts_responses = df
        else:
            df_resource_alerts_responses = df_resource_alerts_responses.append(df, ignore_index=True)
    print(df_resource_alerts_responses)
    if df_resource_alerts_responses is not None:
        send_email(df_resource_alerts_responses)
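One caveat worth adding: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a current pandas the accumulation step needs pd.concat instead. A minimal sketch of Option 2 under that assumption:

def main():
    move_csv_files()
    session = requests.Session()
    session_id = get_session_id(session)
    frames = []
    for res in RES_EXT:
        resource_alerts_response = get_resource_alerts_response(session,
                                                                session_id, res)
        frames.append(resource_alerts_to_df(resource_alerts_response))
    if frames:
        # Concatenate all per-resource DataFrames into one and send a single email.
        df_resource_alerts_responses = pd.concat(frames, ignore_index=True)
        send_email(df_resource_alerts_responses)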

AttributeError: Response content isn't text. What is the problem?

I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, and I am getting "AttributeError: Response content isn't text". I am running this code in a python3 venv. Could you help me explain and solve the problem?
Here is my code:
import scrapy
import json


class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'

SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'

AUTOTHROTTLE_ENABLED = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

PROXY_POOL_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
Here are the error messages from the console output:
Thank you...
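For what it's worth, the message itself comes from Scrapy: accessing .text on a plain, non-text Response raises AttributeError("Response content isn't text"), which can happen when a proxy returns a broken or binary body and a middleware then tries to inspect it. A defensive guard in the callback could look like this sketch (a variant of get_membership_no from the spider above; the retry policy is my own assumption, adjust as needed):

from scrapy.http import TextResponse

def get_membership_no(self, response):
    # Sketch: skip non-text responses (e.g. garbage returned by a bad proxy)
    # instead of letting json.loads blow up further down.
    if not isinstance(response, TextResponse):
        self.logger.warning("Non-text response from %s; retrying", response.url)
        yield response.request.replace(dont_filter=True)
        return
    data = json.loads(response.body)
    for entry in data['data']:
        yield scrapy.Request(
            url='https://basis.org.bd/get-company-profile/{0}'.format(entry['membership_no']),
            callback=self.parse,
        )
    if data['links']['next']:
        yield scrapy.Request(url=data['links']['next'], callback=self.get_membership_no)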

Python requests error with the check-mk API

I am trying to add dictionary data to our check-mk Web API with Python requests, but I keep getting an error about missing keys:
{"result": "Check_MK exception: Missing required key(s): aux_tags, tag_groups", "result_code": 1}
Here is my code:
import json
import requests

params_get = (
    ('action', 'get_hosttags'),
    ('_username', 'user'),
    ('_secret', 'secret'),
    ('request_format', 'json'),
)
params_set = (
    ('action', 'set_hosttags'),
    ('_username', 'user'),
    ('_secret', 'secret'),
    ('request_format', 'json'),
)

url = 'http://monitoringtest.local.domain/test/check_mk/webapi.py'
tags = ['tag1', 'tag2']

response_data = requests.get(url, params=params_get)
data = response_data.json()
new_data_tags = data['result']['tag_groups']
new_data = data['result']

# new_tags = []
for tag in tags:
    new_data['aux_tags'].append({'id': tag, 'title': 'tags in datacenter'})
    # new_tags.extend([{'aux_tags': [], 'id': tag, 'title': tag.upper() + ' Tag'}])

# all_tags = new_data_tags.extend([{'tags': new_tags, 'id': 'group1', 'title': 'tags in datacenter'}])

json.dump(data['result'], open("new_file", "w"))

response_data_new = requests.get(url, params=params_set, json=json.dumps(data['result']))
# response_data_new = requests.put(url, params=params_set)
# requests.post(url, params=params_set)

print(response_data_new.text)
# print(data['result'])
# json.dump(data['result'], open("new_file", "w"))
When I use curl, everything works and I get a success message:
{"result": null, "result_code": 0}
Do you have any idea what causes the error?
Thanks
I found the mistake; it was just a lapse of attention. The data variable contains two extra keys that were being sent along, result at the beginning and result_code at the end, and these need to be stripped out. I just had to take only data['result'] and send it with POST, as follows:
resp = requests.post(url, params=params_set, data={'request': json.dumps(data['result'])})
Thanks @DeepSpace
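Put together, the working flow is roughly this sketch (a condensation of the question plus the fix above; url, tags, params_get and params_set are the same as in the question):

# Fetch the current host tags, keep only the 'result' payload,
# add the new aux tags, and send it back with POST under the 'request' key.
data = requests.get(url, params=params_get).json()
payload = data['result']                      # drops the 'result'/'result_code' wrapper keys
for tag in tags:
    payload['aux_tags'].append({'id': tag, 'title': 'tags in datacenter'})

resp = requests.post(url, params=params_set,
                     data={'request': json.dumps(payload)})
print(resp.text)   # expect: {"result": null, "result_code": 0}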

Login to Website using Python and __RequestVerificationToken

I've been trying to log in to a website using Python so I can access some photos that are behind a login screen. I've seen a bunch of examples here, but none of them seem to work. Here is my code:
#!/usr/local/bin/python3
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    s = requests.Session()
    url = 'http://www.usafawebguy.com/Account/Login'
    g = s.get(url)
    token = BeautifulSoup(g.text, 'html.parser').find('input', {'name': '__RequestVerificationToken'})['value']
    print(token)
    payload = {'UserName': 'username',
               'Password': 'password',
               'RememberMe': 'true',
               'ReturnURL': '/Photos/2022?page=1',
               '__RequestVerificationToken': token}
    p = s.post(url, data=payload)
    soup = BeautifulSoup(p.text, 'html.parser')
    print(soup.title)
    #print(p.text)
    r = s.get('https://www.usafawebguy.com/Photos/2022?page=1')
    soup = BeautifulSoup(r.text, 'html.parser')
    print(soup.title)
    #print(r.text)
It always brings me back to the login screen. Thanks in advance!
A one-character change: the url variable needs to be set to "https://...", not "http://...". Live and learn.
