Response with scrapy shows wrong or other data - python-3.x

I am scraping a page, but when I request the link that should return all the information, the response says that the data does not exist. When I check the same JSON request with the Firefox inspector, the response contains all the information. I have tried manipulating the headers, but I have not succeeded in getting the data to show.
my code:
settings.py:
# Scrapy project settings used by the spider below.
# Browser-like User-Agent string (Firefox 80 on Windows 10).
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
# robots.txt rules are not honoured.
ROBOTSTXT_OBEY = False
# One request at a time, with a fixed delay between requests.
CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 3
# Cookie handling is switched off.
# NOTE(review): if the search API relies on a cookie set by the home page,
# it will not be sent back with this setting - confirm against the site.
COOKIES_ENABLED = False
mi_spider.py:
from scrapy import Spider
from scrapy.http import Request
from json import loads, dump
# Index of the "box-list" element whose links are collected in parse().
N_categoria = 0
# Value sent as the X-Api-Key request header in first_parse().
# NOTE(review): hard-coded credential - consider loading it from settings
# or the environment instead of committing it to source.
API_key = 'P1MfFHfQMOtL16Zpg36NcntJYCLFm8FqFfudnavl'
class MetrocScrapingSpider(Spider):
    """Spider that reads category links from the metrocuadrado.com home page
    and queries the site's ``rest-search`` endpoint for one of them.

    Callback chain: ``parse`` -> ``first_parse`` -> ``main_parse``.
    """

    name = 'metroc_scraping'
    allowed_domains = ['metrocuadrado.com']
    start_urls = ['https://www.metrocuadrado.com/']

    def parse(self, response):
        """Collect the links of the selected "box-list" block.

        Each link is split into its path segments; the first, second and
        last segments are stored as ``inmu_``, ``type_`` and ``loc_``.
        The start URL is then requested again so that ``first_parse``
        receives the collected data through ``meta``.
        """
        print()
        print('Entra aca 1')
        print()
        # XPath attributes are addressed with "@" ("#class"/"#href" is not
        # valid XPath and raises a selector error).
        aptos_links = response.xpath('//*[@class= "box-list"]')[N_categoria].xpath('.//li//a/@href').extract()
        data_links = []
        for url in aptos_links:
            # Build a filtered list instead of popping while enumerating:
            # removing items during iteration skips the element that follows
            # each removal, so consecutive empty segments would survive.
            parts = [part for part in url.split('.com')[-1].split('/') if part != '']
            data_links.append({
                'inmu_': parts[0],
                'type_': parts[1],
                'loc_': parts[-1],
            })
        n_cat = 1
        yield Request(url=response.url,
                      callback=self.first_parse,
                      meta={'data_links': data_links,
                            'n_cat': n_cat,
                            'aptos_links': aptos_links},
                      dont_filter=True)

    def first_parse(self, response):
        """Build the search-API URL for category ``n_cat`` and request it.

        Reads ``data_links``, ``n_cat`` and ``aptos_links`` from
        ``response.meta``. The category link is sent as ``Referer`` and
        ``API_key`` as ``X-Api-Key``.
        """
        data_links = response.meta['data_links']
        n_cat = response.meta['n_cat']
        aptos_links = response.meta['aptos_links']
        n_from = 0
        cat_link = aptos_links[n_cat]
        data_link = data_links[n_cat]
        print(data_link)
        inmu_ = data_link['inmu_']
        type_ = data_link['type_']
        loc_ = data_link['loc_']
        # The offset ("from") and page size are appended when the request is
        # made, so the prefix can be reused through meta['api_link'].
        api_link = 'https://www.metrocuadrado.com/rest-search/search?realEstateTypeList='+inmu_+'&realEstateBusinessList='+type_+'&city='+loc_+'&from='
        yield Request(url=api_link + str(n_from) + '&size=50',
                      callback=self.main_parse,
                      meta={'data_links': data_links,
                            'n_cat': n_cat,
                            'n_from': n_from,
                            'api_link': api_link},
                      dont_filter=True,
                      headers={'Accept': 'application/json, text/plain, */*',
                               'Accept-Encoding': 'gzip, deflate, br',
                               'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
                               'Connection': 'keep-alive',
                               'DNT': '1',
                               'Host': 'www.metrocuadrado.com',
                               'Upgrade-Insecure-Requests': '1',
                               'Referer': cat_link,
                               'Pragma': 'no-cache',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
                               'X-Api-Key': API_key,
                               'X-Requested-With': 'XMLHttpRequest'})

    def main_parse(self, response):
        """Print the URL, the HTTP status and the decoded JSON body."""
        print()
        print(response.url)
        print()
        print(response.status)
        print()
        jsonresponse = loads(response.text)
        print(jsonresponse)
Below the link and the response status is the JSON response.
As you can see, "totalHits" is 0, "totalEntries" is 0 too, and "results" is empty. But if you look at the Firefox inspector:
screenshot of the request headers
Part of the response in the Firefox inspector (it may be hard to see, but "totalHits" is 3135 and "totalEntries" is 3135):
enter image description here
I don't know why this happens. Any help, please?

Related

Response provided for One Page(200) but not for Other(401)

On the same website, one endpoint (url/nifty) consistently returns a valid response (200), but another endpoint (url/oispurtscontracts) does not (401).
The second endpoint does return a valid response sometimes; at other times it returns a 401 error.
Browser cache cleared and reloaded.
Please provide a solution.
Error :
response.status_code = 401 for https://www.nseindia.com/api/live-analysis-oi-spurts-contracts
Code:
import requests
def connRequest(url,headers):
    """GET ``url`` twice on one session and return the second reply as JSON.

    The first reply's cookies are copied into a plain dict and passed
    explicitly on the second call. The status code of the first reply and
    the decoded JSON of the second are printed.
    """
    http = requests.Session()
    first_reply = http.get(url, headers=headers)
    jar = dict(first_reply.cookies)
    print(f"response.status_code = {first_reply.status_code} for {url}")
    second_reply = http.get(url, headers=headers, cookies=jar)
    response = second_reply.json()
    print(f"response = {response}")
    return response
# Working - Response Provided
def nifty_Working():
    """Request the NIFTY option-chain endpoint through ``connRequest``."""
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.5',
        'Accept':'application/json'
    }
    connRequest(
        'https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY',
        browser_headers,
    )
# 401 Error
def oiSpurtsContracts_NotWorking():
    """Request the OI-spurts-contracts endpoint through ``connRequest``."""
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en',
        'Accept': 'application/json'
    }
    connRequest(
        'https://www.nseindia.com/api/live-analysis-oi-spurts-contracts',
        browser_headers,
    )
def main():
    """Call the working endpoint, pause one second, then call the failing one."""
    # ``time`` is not imported at file level; without this import the
    # ``time.sleep`` call below raises NameError.
    import time

    # Working - Response Provided
    nifty_Working()
    # Four blank lines to separate the two outputs.
    for _ in range(4):
        print()
    # 401 Error
    time.sleep(1)
    oiSpurtsContracts_NotWorking()


main()

Scraping values from View Source using Requests Python 3

The code below works fine, but when I change the URL to another site it no longer works.
import requests
import re
# Listing page to fetch.
url = "https://www.autotrader.ca/a/ram/1500/hamilton/ontario/19_12052335_/?showcpo=ShowCpo&ncse=no&ursrc=pl&urp=2&urm=8&sprx=-2"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get(url, headers=headers)
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# (a warning today, an error in future Python versions).
phone_number = re.findall(r'"phoneNumber":"([\d-]+)"', response.text)
print(phone_number)
['905-870-7127']
The code below doesn't work; it gives the output []. Please tell me what I am doing wrong.
import requests
import re
# Tuple of pages to fetch (the same listing appears twice).
urls = "https://www.kijijiautos.ca/vip/22686710/","https://www.kijijiautos.ca/vip/22686710/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
for url in urls:
    response = requests.get(url, headers=headers)
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence (a warning today, an error in future Python versions).
    number = re.findall(r'"number":"([\d-]+)"', response.text)
    print(number)
[]
I think you are not getting an HTTP 200 OK status in the response, which is why you cannot get the expected output. To get HTTP 200 OK, I changed the headers to match the ones seen when inspecting the browser's HTTP requests.
Please try this:
import requests
import re
import requests
# Request headers sent with the call to the /consumer/svc/a/<id> endpoint.
# NOTE(review): presumably copied from a browser session - the cookie and
# x-client-id values look session-specific and may expire; confirm.
headers = {
'authority': 'www.kijijiautos.ca',
'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
'pragma': 'no-cache',
'accept-language': 'en-CA',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
'content-type': 'application/json',
'accept': 'application/json',
'cache-control': 'no-cache',
'x-client-id': 'c89e7ff8-1d5a-4c2b-a095-c08dc08ccd3b',
'x-client': 'ca.move.web.app',
'sec-ch-ua-platform': '"Linux"',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.kijijiautos.ca/cars/hyundai/sonata/used/',
'cookie': 'mvcid=c89e7ff8-1d5a-4c2b-a095-c08dc08ccd3b; locale=en-CA; trty=e; _gcl_au=1.1.1363596757.1633936124; _ga=GA1.2.1193080228.1633936126; _gid=GA1.2.71842091.1633936126; AAMC_kijiji_0=REGION%7C3; aam_uuid=43389576784435124231935699643302941454; _fbp=fb.1.1633936286669.1508597061; __gads=ID=bb71a6fc168c1c33:T=1633936286:S=ALNI_MZk3lgy-9xgSGLPnfrkBET60uS6fA; GCLB=COyIgrWs-PWPsQE; lux_uid=163402080128473094; cto_bundle=zxCnjF95NFglMkZrTG5EZ2dzNHFSdjJ6QSUyQkJvM1BUbk5WTkpjTms0aWdZb3RxZUR3Qk1nd1BmcSUyQjZaZVFUdFdSelpua3pKQjFhTFk0U2ViTHVZbVg5ODVBNGJkZ2NqUGg1cHZJN3V0MWlwRkQwb1htcm5nNDlqJTJGUUN3bmt6ZFkzU1J0bjMzMyUyRkt5aGVqWTJ5RVJCa2ZJQUwxcFJnJTNEJTNE; _td=7f855061-c320-4570-b2d2-73c94bd22b13; rbzid=54THgSkyCRKwhVBqy+iHmjb1RG+uE6uH1XjpsXIazO5nO45GtpIXHGYii/PbJcdG3ahjIgKaBrjh0Yx2J6YCOLHEv3QYL559oz3jQaVrssH2/1Ui9buvIpuCwBOGG2xXGWW2qvcU5807PGsdubQDUvLkxmy4sor+4EzCI1OoUHMOG2asQwsgChqwzJixVvrE21E/NJdRfDLlejb5WeGEgU4B3dOYH95yYf5h+7fxV6H/XLhqbNa8e41DM3scfyeYWeqWCWmOH2VWZ7i3oQ0OXW1SkobLy0D6G+V9J5QMxb0=; rbzsessionid=ca53a07d3404ca93b3f8bc879291dc83; _uetsid=131a47702a6211ecba407d9ff6588dde; _uetvid=131b13602a6211ecacd0b56b0815e9b2',
}
# Query the JSON service endpoint for the listing instead of the HTML page.
response = requests.get('https://www.kijijiautos.ca/consumer/svc/a/22686710', headers=headers)
if response.status_code == 200:
    numbers = re.findall(r'"number":"\+\d+"', response.text)  # "+" followed by one or more digits
    if numbers:
        print(numbers[0])
    else:
        # findall returns an empty list when nothing matches; indexing it
        # unconditionally would raise IndexError.
        print('no phone number found in response')
else:
    print('status code is ', response.status_code)
output
# "number":"+17169905088"

How to get the all content below the "colspan" selector?

I am sending a POST request to filter by ID and then parsing the output. I need to get all the content below the colspan selector (the desired output is shown at the end). Under colspan="4" there are many b and table tags, and the tables contain tbody > tr > td, but my script returns only the content of the b tags.
The URL: https://e-mehkeme.gov.az/Public/Cases
import requests
from bs4 import BeautifulSoup as bs
# Headers sent with the search POST.
# NOTE(review): 'authority', 'method', 'path' and 'scheme' look like HTTP/2
# pseudo-headers copied from browser dev tools (there written ':authority'
# etc.); sent like this they are ordinary, probably meaningless headers.
# NOTE(review): 'content-length' is hard-coded to '66' although the body
# length depends on the form data - verify it is needed at all.
request_headers = {
'authority': 'e-mehkeme.gov.az',
'method': 'POST',
'path': '/Public/Cases',
'scheme': 'https',
# Adjacent string literals are concatenated into a single Accept value.
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
'application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en,en-GB;q=0.9',
'cache-control': 'max-age=0',
'content-length': '66',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://e-mehkeme.gov.az',
'referer': 'https://e-mehkeme.gov.az/Public/Cases',
'upgrade-insecure-requests': '1',
# Adjacent string literals are concatenated into a single User-Agent value.
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/75.0.3770.142 Safari/537.36',
}
# VOEN values to search for; this is a set literal, so iteration order is
# not guaranteed.
voens = {'1303450301',
'1700393071',
'2002283071',
}
# Form fields of the search POST.
# NOTE(review): 'VOEN' initially holds the whole set rather than a single
# value; the loop further down replaces it with one VOEN at a time.
form_data = {
'CourtId': '',
'CaseNo': '',
'DocFin': '',
'DocSeries': '',
'DocNumber': '',
'VOEN': voens,
'button': 'Search',
}
url = 'https://e-mehkeme.gov.az/Public/Cases?courtid='
response = requests.post(url, data=form_data, headers=request_headers)
s = bs(response.content, 'lxml')
# PRINT THE HEADERS!
sHeader = s.findAll('tr', {'class': 'centeredheader'})[0]
headers = [sHeader.get_text().strip()]
print(headers)
# PRINT THE CONTENTS OF EACH SEARCH!
for voen in voens:
    form_data['VOEN'] = voen
    # Updating form_data alone has no effect: the search must be re-sent and
    # its response re-parsed for every VOEN.
    response = requests.post(url, data=form_data, headers=request_headers)
    s = bs(response.content, 'lxml')
    # select() takes a CSS selector; the attribute filter belongs inside it
    # (a ``colspan_="4"`` keyword argument does not filter by attribute).
    idData = s.select('td[colspan="4"]')
    print(idData)
DESIRED OUTPUT:
Ətraflı məlumat:
İşə baxan hakim və ya tərkib
Abiddin Hüseynov - sədrlik edən hakim
Azad İmanov - tərkib üzvü
Vahid Sadıqov - tərkib üzvü
Tərəflər
Cavabdeh: ƏLİYEV HAFİZ RAMİZ
İddiaçı: "OPTİMAL ELEKTRONİKA" MƏHDUD MƏSULİYYƏTLİ CƏMİYYƏTİ
İşin mahiyyəti
Müqavilələrdən əmələ gələn öhdəliklər üzrə mübahisələr
You need to make a POST with each updated voen value in a loop and extract the ids and make new requests.
import requests,re
from bs4 import BeautifulSoup as bs
# For every VOEN: run the search, collect the case ids from the
# ".casedetail" elements, then fetch each case's detail page and print the
# text of its colspan="4" cells with line breaks removed.
search_form = {'VOEN': '', 'button': 'Search'}
voens = ['1303450301', '1700393071', '2002283071']
for voen in voens:
    search_form['VOEN'] = voen
    listing = requests.post('https://e-mehkeme.gov.az/Public/Cases', data=search_form)
    listing_soup = bs(listing.text, 'lxml')
    case_ids = []
    for node in listing_soup.select('.casedetail'):
        case_ids.append(node['value'])
    for case_id in case_ids:
        detail = requests.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={case_id}')
        detail_soup = bs(detail.content, 'lxml')
        cell_texts = []
        for cell in detail_soup.select('[colspan="4"]'):
            cell_texts.append(re.sub('\n|\r|\n','',cell.text.strip()))
        print(cell_texts)

why i can't get the result from lagou this web site by using web scraping

I'm using Python 3.6.5 and my operating system is macOS 10.13.6.
I'm learning web scraping and I want to fetch data from this website (https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=).
Here is my code:
# encoding: utf-8
import requests
from lxml import etree
def parse_list_page():
    """POST a job search to Lagou's positionAjax endpoint and print the JSON reply."""
    endpoint = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
    request_headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host':'www.lagou.com',
        'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code':'0',
        'X-Anit-Forge-Token':None,
        'X-Requested-With':'XMLHttpRequest',
    }
    # Form fields: first-page flag, page number and search keyword.
    form = {
        'first':'false',
        'pn':1,
        'kd':'python',
    }
    reply = requests.post(endpoint, headers=request_headers, data=form)
    print(reply.json())
def main():
    """Script entry point: run the single list-page request."""
    parse_list_page()


if __name__ == '__main__':
    main()
Thank you for taking the time to answer my question.
I found the answer; here is the code:
# encoding: utf-8
import requests
from lxml import etree
import time
def parse_list_page():
    """Fetch the Lagou listing page, then query the JSON endpoint with its cookies.

    Step 1: GET the HTML listing page and turn the cookies of that response
    into a plain dict.
    Step 2: POST the search form to ``positionAjax.json``, passing those
    cookies, and print the decoded JSON reply.
    """
    url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3'
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host':'www.lagou.com',
        'Referer':'https://www.lagou.com/',
        'Connection':'keep-alive',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
        'Upgrade-Insecure-Requests':'1',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cache-Control':'no-cache',
        'Pragma':'no-cache',
    }
    response = requests.get(url,headers=headers)
    r = requests.utils.dict_from_cookiejar(response.cookies)
    print(r)
    print('='*30)
    # Only the cookies returned by the first request are sent on the second.
    cookies = {}
    cookies.update(r)
    print(r)
    print('=' * 30)
    print(cookies)
    print('=' * 30)
    headers = {
        'Origin':'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Encoding': 'gzip, deflate, br',
        # Was 'h-CN', which is not a language tag; matches the first request.
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3',
        'Connection': 'keep-alive',
    }
    params = {
        'px':'default',
        'city':'深圳',
        'needAddtionalResult':'false'
    }
    data = {
        'first':'true',
        'pn':1,
        'kd':'python',
    }
    url_json = 'https://www.lagou.com/jobs/positionAjax.json'
    response = requests.post(url=url_json,headers=headers,params=params,cookies=cookies,data=data)
    print(response.json())
def main():
    """Script entry point: run the two-step list-page request."""
    parse_list_page()


if __name__ == '__main__':
    main()
The reason I could not get the JSON response is the site's anti-scraping rule: the JSON request must carry the cookies issued by the first (HTML page) request.
So when you send the first request you need to save its cookies and then pass them along with the second request. I hope this helps when you face the same problem while web scraping.

Python3 requests post correctly but get nothing(but by browser is ok)

When I visit 'https://baike.baidu.com/wikitag/taglist?tagId=75953' in Chrome, I can see through Fiddler that the browser sends a POST request to 'https://baike.baidu.com//wikitag/api/getlemmas'.
So I am trying to send a POST request with form data to 'https://baike.baidu.com//wikitag/api/getlemmas' and read the JSON data from its response.
I copied all the headers and form data from Fiddler and tried to send the same POST request from Python 3 using the requests package.
But even though I send the POST request with the same headers and form data, I get a response (status 200) with an empty body.
The same request sent with Postman works fine, but from Python 3 it fails every time.
# -*- coding:UTF-8 -*-
import requests
def disease_json():
    """POST the lemma-list form to Baidu Baike's ``getlemmas`` API and print the reply."""
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    # No explicit Content-Length: requests computes it from the encoded body,
    # so a hard-coded value can only be wrong.
    headers = {
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer':'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin':'https://baike.baidu.com',
        'Connection':'keep-alive',
        'Host':'baike.baidu.com',
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags':'[]',
        # Key is 'tagId' (as in the referring page URL), not 'tagID'.
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    req=requests.post(url=target,data=forms,verify=False,headers=headers)
    print(req.text)
def main():
    """Script entry point: run the single API request."""
    disease_json()


if __name__ == '__main__':
    main()
Following is the correct request sent by browser:
I modified the content-type and the request payload, and added a method, encode_multipart_data, that transforms the payload so it is consistent with multipart/form-data.
import sys
import requests
def encode_multipart_data(fields):
    """Encode ``fields`` as a multipart/form-data body string.

    Args:
        fields: Mapping of form field name to value. Values are converted
            with ``str()``, so numbers are accepted as well as strings.

    Returns:
        The body as one string: for every field a delimiter line, a
        Content-Disposition line, an empty line and the value, followed by
        the closing delimiter. Lines are joined with CRLF.
    """
    # Delimiter line = "--" + the boundary announced in the Content-Type
    # header ("----WebKitFormBoundary7MA4YWxkTrZu0gW").
    boundary = '------WebKitFormBoundary7MA4YWxkTrZu0gW'
    crlf = '\r\n'
    lines = []
    for key, value in fields.items():
        lines.append(boundary)
        # The trailing CRLF plus the join below yields the blank line that
        # separates the part header from its content.
        lines.append('Content-Disposition: form-data; name="%s"\r\n' % key)
        # str() so that non-string values do not make join() raise TypeError.
        lines.append(str(value))
    lines.append(boundary + "--")
    return crlf.join(lines)
def disease_json():
    """POST the lemma-list form as multipart/form-data and print the reply text."""
    target = 'https://baike.baidu.com' + '/wikitag/api/getlemmas'
    form_fields = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    request_headers = {
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest',
        # The boundary here must match the one used by encode_multipart_data.
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'Referer':'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin':'https://baike.baidu.com',
        'Connection':'keep-alive',
        'Host':'baike.baidu.com'
    }
    body = encode_multipart_data(form_fields)
    reply = requests.post(url=target, data=body, headers=request_headers)
    print(reply.text)


if __name__ == '__main__':
    disease_json()
This way can also solve the problem.
import requests
import http.cookiejar
import json
# Endpoint, pre-encoded form body and headers shared by the functions below.
url = "https://baike.baidu.com/wikitag/api/getlemmas"
# Field names spelled as in the browser's form data ("filterTags",
# "contentLength"); "%5B%5D" is the URL-encoded "[]".
payload = "limit=24&timeout=3000&filterTags=%5B%5D&tagId=75953&fromLemma=false&contentLength=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    # Adjacent literals instead of a backslash continuation inside the
    # string, which fused "…181" and "Safari" or embedded stray whitespace.
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 "
                  "Safari/537.36"
}
def get_cookies():
    """POST once without following redirects and save the cookies to the file "cookie".

    Returns the response of that POST.
    """
    http_session = requests.Session()
    jar = http.cookiejar.LWPCookieJar("cookie")
    http_session.cookies = jar
    first_reply = http_session.post(url, headers=headers, data=payload, allow_redirects=False,verify=False)
    jar.save(ignore_discard=True, ignore_expires=True)
    return first_reply
def disease_json(times=-1):
    """Fetch the lemma JSON, re-posting to the redirect target with saved cookies.

    When the first POST answers 302, the cookies saved by ``get_cookies``
    are loaded from the file "cookie" and the same payload is posted to the
    ``Location`` URL. The decoded JSON and the incremented ``times`` counter
    are printed.
    """
    times = times + 1
    response = get_cookies()
    if response.status_code == 302:
        follow_up = requests.session()
        jar = http.cookiejar.LWPCookieJar(filename='cookie')
        follow_up.cookies = jar
        jar.load(ignore_discard=True)
        redirect_target = response.headers['Location']
        response = follow_up.post(redirect_target, headers=headers, data=payload, allow_redirects=False)
    print(json.loads(response.text))
    print(times)


if __name__ == '__main__':
    disease_json()

Resources