How to implement 'ThreadPoolExecutor' in Python? - multithreading

Actually, I have scraped the Nykaa website, but the problem is that it takes around 10 days to fetch the complete data, which is very time-consuming.
Basically, I want to optimize this code so that it takes the least time possible. Recently, I learned about the concepts of threading and 'ThreadPoolExecutor'; threading should speed up the task by running the requests concurrently.
Now, this is the code I've tried to implement 👇🏻
from scrapy.loader.processors import MapCompose
from scrapy import Spider
from scrapy import Request, FormRequest
from scrapy.selector import Selector
from w3lib.html import remove_tags
from scrapy.loader import ItemLoader
import json
import re
from scrapy.loader import ItemLoader
from couponsscraper.loaders import ListingLoader
import time
import concurrent.futures
class MySpider(Spider):
    name = 'nykaa_comla'
    custom_settings = {
        # "PROXY_ON": True,
        "HTTPCACHE_ENABLED": False,
        "CONCURRENT_REQUESTS": 4,
        "COOKIES_ENABLED": True,
        "AUTOTHROTTLE_ENABLED": True,
        "RETRY_TIMES": 5,
        "DOWNLOAD_DELAY": 1,
        "RETRY_HTTP_CODES": [500, 503, 504, 400, 401, 403, 405, 404, 408, 416, 456, 502, 429, 307]
    }
    headers = {
        # "accept": "application/json, text/plain, */*",
        # "accept-encoding": "gzip, deflate, br",
        # "accept-language": "tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7",
# "cookie": "bcookie=3a6db8c0-6226-4d67-9b3c-39ee6fb7002b; EXP_ADP_RV_REORDER=A; EXP_ADP_RV_SEGMENT=B; run=43; D_LST=1; D_PDP=1; _gcl_au=1.1.515177829.1644933995; _ga=GA1.2.1857014358.1644933996; _gid=GA1.2.1588295387.1644933996; d_info=1536_864; N_S_A_W=1; N_A_P=1; GPAY_INTENT=1; D_HOM=1; N_CMS=0; PHPSESSID=041dvmhr4r3nqj0mt051pid5q0; disableSmartLogin=true; SHOW_PREVIEW_INFO_PAGE=false; OLD_LISTING_FLOW=false; GC_PREVIEW_EXPERIMENT=true; NEW_PRE_CONFIRMATION=true; C_AUTH=false; NEW_PAYMENT_PAGE=false; VIEW_COUPON_EXPERIMENT=true; NEW_ORDERLISTING_PAGE=true; NEW_ORDERDETAIL_PAGE=false; AMCVS_FE9A65E655E6E38A7F000101%40AdobeOrg=1; s_cc=true; frontendSource=react; pro=false; head_data_react={}; form_key=FFGAZIZcYSy6guaw; s_nr=1644987663041-Repeat; countryCode=TR; storeId=nykaa; lux_uid=164502537781396625; AMCV_FE9A65E655E6E38A7F000101%40AdobeOrg=-432600572%7CMCIDTS%7C19039%7CMCMID%7C81426820227033618243066641334787189520%7CMCAAMLH-1645633715%7C6%7CMCAAMB-1645633715%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1645036115s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.5.2; SITE_VISIT_COUNT=85; s_nr365=1645030255086-Repeat; cto_bundle=yZ-Tl19IZXRJdEFwYmJiRFlKdEUyRnMwdTJSYU8xanJLMjklMkJFZTdkSk1rZ2lobHFpRFU1RXRvNTFFOW5LM0x0NXE4RCUyQlk0eW5hZHklMkI1bjFIODFPeiUyQlljNXlUWnJSNTglMkJINU5LR1h6NlBVSWNROVNqNHZyazFBeVhiYzR0RSUyRmNiemQlMkYlMkJQMktDYlRiZ1pFbUVORlZoWERKbzFRJTNEJTNE; TS017d4a61=01317c4d00f4b4ecb9729e778b9c17dcf4ff9e0bde970f7d1b21ba9c6b22be6412bd39ff683b78cec4d5d72fa884218be5d8643776; TS5e83d05f027=08d3514ca6ab2000f82ea30abe3eacc9f428099d8668cb41a92cd2d81928f740933aee4731fd569b083d5530361130005c7a54f3c04dfcf2867d258ecec44f2412921fab91b839896e317f4c2953ff0d9fb3c2db28b8cfda2319bf9176d7af9f; s_sq=fsnecommerceprod%3D%2526c.%2526a.%2526activitymap.%2526page%253DNykaa%2526link%253DHair%252520Removal%252520Tools%2525202%2526region%253Dcustom-scroll%2526pageIDType%253D1%2526.activitymap%2526.a%2526.c%2526pid%253DNykaa%2526pidt%253D1%2526oid%253Dfunctioncn%252528%252529%25257B%25257D%2526oidt%253D2%2526ot%253DDIV",
        # "referer": "https://www.nykaa.com/personal-care-appliances/c/1390?page_no=1&sort=popularity&eq=desktop&category_filter=1408,1412,1411,1543,6290,1405,1404,1517,41,43,44,1399,1400,1406",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    }
    def start_requests(self):
        start_urls = [
            {
                "url": [
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=1408%2C1412%2C1411%2C1543%2C6290%2C1405%2C1404%2C1517%2C41%2C43%2C44%2C1399%2C1400%2C1406%2C1401&category_id=1390&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=8446%2C8452%2C8443%2C8444%2C8441%2C8445%2C8448%2C8397%2C8442%2C8394%2C8449%2C8453%2C8451%2C8395%2C8396%2C8398%2C8420%2C8426%2C8644%2C8421%2C8427%2C8423%2C14001%2C8422%2C8424%2C8436%2C8438%2C8437%2C8382%2C8383%2C8381%2C8380%2C8379%2C8400%2C8401%2C8392%2C8429%2C8430%2C8409%2C8410%2C8411%2C8432%2C8433%2C8434%2C8405%2C8407%2C8406%2C8403%2C8404&category_id=8377&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=242%2C240%2C2440%2C687%2C241%2C247%2C245%2C244%2C931%2C1995%2C228%2C235%2C237%2C233%2C234%2C231%2C236%2C239%2C6761%2C4140%2C232%2C229%2C220%2C249%2C263%2C254%2C250%2C251%2C253%2C5079%2C252%2C7008%2C267%2C268%2C4037%2C271%2C266%2C269%2C270%2C273%2C272%2C947%2C1513%2C1514%2C3819%2C3745%2C3749%2C3746%2C255%2C3277%2C257%2C256%2C260&category_id=12&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=2444%2C2048%2C361%2C362%2C364%2C363%2C2049%2C7007%2C316%2C319%2C2046%2C2040%2C320%2C2041%2C22630%2C317%2C1214%2C1222%2C2746%2C2045%2C2043%2C331%2C332%2C346%2C329%2C11111&category_id=24&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=13561%2C6790%2C1642%2C1645%2C326%2C13562%2C368%2C367%2C369%2C370%2C39%2C391%2C1654%2C9475%2C1647%2C690%2C1816%2C381%2C584%2C385%2C382%2C7466%2C371%2C377%2C374%2C1389%2C688%2C7010%2C396%2C1386%2C1640%2C663%2C1133%2C1660%2C662%2C290%2C7009%2C2086%2C14358&category_id=12930&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=9607%2C9606%2C9611%2C9608%2C9609%2C9613%2C9612%2C9610%2C9626%2C9627%2C9625%2C9624%2C9628%2C9631%2C9630%2C9632%2C9629%2C9582%2C9581%2C9567%2C9571%2C9584%2C9569%2C9566%2C9579%2C9576%2C9568%2C9634%2C9637%2C9636%2C9635&category_id=12930&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=14799%2C14801%2C14806%2C14805%2C14800%2C14803%2C14834%2C15013%2C15010%2C14802%2C14804%2C14807%2C14808%2C15009%2C14809%2C14828%2C14829%2C15357%2C15123%2C15124%2C14838%2C14839%2C14844%2C14840%2C14836%2C22633%2C14824%2C14820%2C14822%2C14819%2C14821%2C14823%2C15129%2C16966%2C16969%2C16964%2C16968%2C16970%2C15358%2C14846%2C14816%2C14813%2C14815%2C14814%2C14812%2C14817%2C14811%2C15126%2C15127%2C14848%2C14835%2C15125&category_id=14797&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=19868%2C19869%2C20421%2C19867%2C19864%2C1657%2C1656%2C1658%2C18549%2C18550%2C18547%2C19894%2C18548%2C18556%2C18555%2C18553%2C1620%2C18552%2C18554%2C2027%2C19878%2C19856%2C1614%2C19881%2C19876%2C19857%2C19875%2C1618%2C19879%2C19877%2C19870%2C18577%2C18575%2C18573%2C18698%2C18574%2C19872%2C18570%2C18697%2C18579%2C19871%2C18578%2C18569%2C18542%2C2020%2C18544%2C18543%2C18541%2C18551%2C18558%2C18561%2C18567%2C18559%2C18566%2C18560%2C18565%2C18564%2C18563%2C19859%2C19861%2C18562%2C19858%2C19860&category_id=671&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=1324%2C1323%2C328%2C1287%2C568%2C11028%2C1289%2C1548%2C1291%2C11029%2C1290%2C1288%2C11045%2C11216%2C11054%2C11047%2C11046%2C2803%2C11048%2C2801%2C2799%2C1294%2C1415%2C1297%2C1298%2C1293%2C1296%2C1295%2C1414%2C1413%2C1308%2C1307%2C7034%2C1316%2C1313%2C1314%2C1312%2C1311%2C1328%2C6543%2C1326%2C1329%2C1305%2C1303%2C1306%2C1304%2C1302%2C1301&category_id=9150&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity",
"https://www.nykaa.com/app-api/index.php/products/list?category_filter=974%2C979%2C1235%2C975%2C962%2C971%2C970%2C18521%2C3126%2C18313%2C18319%2C18311&category_id=53&client=react&filter_format=v2&page_no=1&platform=website&sort=popularity"
                ],
            },
        ]  # LEVEL 1

        t1 = time.perf_counter()

        def download_urls(start_urls):
            for url in start_urls:
                for item in url.get('url'):
                    yield Request(
                        url=item,
                        callback=self.jump, headers=self.headers
                    )

        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(download_urls, start_urls)

        t2 = time.perf_counter()
        print(f'Finished in {t2-t1} seconds')
    def jump(self, response):
        headers = {
            # "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
# "cookie": "bcookie=3a6db8c0-6226-4d67-9b3c-39ee6fb7002b; EXP_ADP_RV_REORDER=A; EXP_ADP_RV_SEGMENT=B; run=43; D_LST=1; D_PDP=1; _gcl_au=1.1.515177829.1644933995; _ga=GA1.2.1857014358.1644933996; _gid=GA1.2.1588295387.1644933996; AMCVS_FE9A65E655E6E38A7F000101%40AdobeOrg=1; d_info=1536_864; s_cc=true; frontendSource=react; N_S_A_W=1; N_A_P=1; GPAY_INTENT=1; D_HOM=1; N_CMS=0; AMCV_FE9A65E655E6E38A7F000101%40AdobeOrg=-432600572%7CMCIDTS%7C19039%7CMCMID%7C81426820227033618243066641334787189520%7CMCAAMLH-1645555116%7C6%7CMCAAMB-1645555116%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1644957516s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.5.2; countryCode=TR; storeId=nykaa; lux_uid=164495373425081540; SITE_VISIT_COUNT=42; s_nr365=1644954403255-Repeat; cto_bundle=FmfdrF9IZXRJdEFwYmJiRFlKdEUyRnMwdTJacUclMkZESTVtbFJoeSUyRm9DeWFuZnNyUHB1Wmp6Mjd5aSUyRnpoJTJGUW9adHBvJTJGaDklMkZVU2JqeTBmUnNQNWRZUkFlZFdkU01HOUtFN1FVM2NqQ2hyJTJCSWNCRHVqWXVIYmxGSmtYalY1ZWVlNzhuOEFFeiUyRmJkYW0lMkY1eExpWTIlMkI5UWFVTk9xUSUzRCUzRA; s_sq=fsnecommerceprod%3D%2526c.%2526a.%2526activitymap.%2526page%253DNykaa%2526link%253DCategory%2526region%253Dfirst-filter%2526pageIDType%253D1%2526.activitymap%2526.a%2526.c%2526pid%253DNykaa%2526pidt%253D1%2526oid%253Dfunctioncn%252528%252529%25257B%25257D%2526oidt%253D2%2526ot%253DDIV; TS017d4a61=01317c4d00801f0f349c15b870e54bb5b073295e5cd3300ec6151f9df175201cd7088a4fddc721927b22edfb27519afe45b7706946; TS5e83d05f027=08d3514ca6ab200050d2f52481b8d6300143ac438c162c3b9a85f08eef7a692b65d0a08e95fa92ca08337d71c4113000d7f4a8888b3d70b89797c2274d4e26fe457de218890038f0d09704e9c58b3c0655ad652c93a0e031aa4d0ee666767637",
            # "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
        }
        page = response.meta.get('page', 2)
        seen = False
        data = json.loads(response.body)['response']['products']
        for item in data:
            try:
                main = item['primary_categories']['l1']['name']
                sub = item['primary_categories']['l2']['name']
                child = item['primary_categories']['l3']['name']
                follow_url = item['product_url']
                yield Request(follow_url, callback=self.parse, meta={'main_cat': main, 'sub_cat': sub, 'child_cat': child}, headers=headers)
                seen = True
            except:
                pass
        if page == 2 or seen:
            headersa = {
                # "accept": "application/json, text/plain, */*",
                # "accept-encoding": "gzip, deflate, br",
                # "accept-language": "tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7",
# "cookie": "bcookie=3a6db8c0-6226-4d67-9b3c-39ee6fb7002b; EXP_ADP_RV_REORDER=A; EXP_ADP_RV_SEGMENT=B; run=43; D_LST=1; D_PDP=1; _gcl_au=1.1.515177829.1644933995; _ga=GA1.2.1857014358.1644933996; _gid=GA1.2.1588295387.1644933996; d_info=1536_864; N_S_A_W=1; N_A_P=1; GPAY_INTENT=1; D_HOM=1; N_CMS=0; PHPSESSID=041dvmhr4r3nqj0mt051pid5q0; disableSmartLogin=true; SHOW_PREVIEW_INFO_PAGE=false; OLD_LISTING_FLOW=false; GC_PREVIEW_EXPERIMENT=true; NEW_PRE_CONFIRMATION=true; C_AUTH=false; NEW_PAYMENT_PAGE=false; VIEW_COUPON_EXPERIMENT=true; NEW_ORDERLISTING_PAGE=true; NEW_ORDERDETAIL_PAGE=false; AMCVS_FE9A65E655E6E38A7F000101%40AdobeOrg=1; s_cc=true; frontendSource=react; pro=false; head_data_react={}; form_key=FFGAZIZcYSy6guaw; s_nr=1644987663041-Repeat; countryCode=TR; storeId=nykaa; lux_uid=164502537781396625; AMCV_FE9A65E655E6E38A7F000101%40AdobeOrg=-432600572%7CMCIDTS%7C19039%7CMCMID%7C81426820227033618243066641334787189520%7CMCAAMLH-1645633715%7C6%7CMCAAMB-1645633715%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1645036115s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.5.2; SITE_VISIT_COUNT=85; s_nr365=1645030255086-Repeat; cto_bundle=yZ-Tl19IZXRJdEFwYmJiRFlKdEUyRnMwdTJSYU8xanJLMjklMkJFZTdkSk1rZ2lobHFpRFU1RXRvNTFFOW5LM0x0NXE4RCUyQlk0eW5hZHklMkI1bjFIODFPeiUyQlljNXlUWnJSNTglMkJINU5LR1h6NlBVSWNROVNqNHZyazFBeVhiYzR0RSUyRmNiemQlMkYlMkJQMktDYlRiZ1pFbUVORlZoWERKbzFRJTNEJTNE; TS017d4a61=01317c4d00f4b4ecb9729e778b9c17dcf4ff9e0bde970f7d1b21ba9c6b22be6412bd39ff683b78cec4d5d72fa884218be5d8643776; TS5e83d05f027=08d3514ca6ab2000f82ea30abe3eacc9f428099d8668cb41a92cd2d81928f740933aee4731fd569b083d5530361130005c7a54f3c04dfcf2867d258ecec44f2412921fab91b839896e317f4c2953ff0d9fb3c2db28b8cfda2319bf9176d7af9f; s_sq=fsnecommerceprod%3D%2526c.%2526a.%2526activitymap.%2526page%253DNykaa%2526link%253DHair%252520Removal%252520Tools%2525202%2526region%253Dcustom-scroll%2526pageIDType%253D1%2526.activitymap%2526.a%2526.c%2526pid%253DNykaa%2526pidt%253D1%2526oid%253Dfunctioncn%252528%252529%25257B%25257D%2526oidt%253D2%2526ot%253DDIV",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
            }
            f_url = response.url.replace(
                f"page_no={page-1}", f"page_no={page}")
            print(f_url)
            print("*****************************************************")
            if f_url:
                yield Request(f_url, callback=self.jump, meta={"page": page+1}, headers=headersa)

    # 1. FOLLOWING
    def parse(self, response):
        item_loader = ListingLoader(response=response)
        external_link = response.url
        item_loader.add_value("external_link", response.url)
        main_cat = str(response.meta.get('main_cat'))
        sub_cat = str(response.meta.get('sub_cat'))
        child_cat = str(response.meta.get('child_cat'))
        title = response.xpath(
            "//script[contains(.,'window.__PRELOADED_STATE__ =')]/text()").get()
        if title:
            title = json.loads(title.split(
                "window.__PRELOADED_STATE__ = ")[-1])['productPage']['product']['name']
            if title:
                item_loader.add_value("title", title)
        price = response.xpath("//span[@class='css-12x6n3h']/text()").get()
        if price:
            price = price.split("₹")[-1]
            item_loader.add_value("price", price.split("₹")[-1])
        country = response.xpath(
            "//script[contains(.,'originOfCountryName')]/text()").get()
        if country:
            country = country.split("originOfCountryName")[1].split(
                ",")[0].replace('"', "").split(":")[-1].replace("\n", "").strip()
            item_loader.add_value('country', country)
        description = response.xpath(
            "//script[contains(.,'window.__PRELOADED_STATE__ =')]/text()").get()
        if description:
            description = json.loads(description.split(
                "window.__PRELOADED_STATE__ = ")[-1])['productPage']['product']['description']
            sel = Selector(text=description, type='html')
            if description:
                description = "".join(sel.xpath(".//p//text()").getall())
                item_loader.add_value("description", description)
            else:
                return []
        date = response.xpath(
            "//script[contains(.,'window.__PRELOADED_STATE__ =')]/text()").get()
        if date:
            date = json.loads(date.split(
                "window.__PRELOADED_STATE__ = ")[-1])['productPage']['product']['expiry']
            if date:
                item_loader.add_value('date', date)
            else:
                return []
        address = response.xpath(
            "//script[contains(.,'window.__PRELOADED_STATE__ =')]/text()").get()
        if address:
            address = json.loads(address.split(
                "window.__PRELOADED_STATE__ = ")[-1])['productPage']['product']['manufacturerAddress']
            if address:
                address = address.split(
                    "manufacturerAddress")[-1].split("}")[0].replace('"', "").split(":")[-1]
                item_loader.add_value(
                    'address', address.replace("\n", "").strip())
        importer = response.xpath(
            "//script[contains(.,'window.__PRELOADED_STATE__ =')]/text()").get()
        if importer:
            importer = json.loads(importer.split(
                "window.__PRELOADED_STATE__ = ")[-1])['productPage']['product']['manufacturerName']
            if importer:
                importer = importer.split(
                    "manufacturerAddress")[-1].split("}")[0].replace('"', "").split(":")[-1]
                item_loader.add_value(
                    'importer', importer.replace("\n", "").strip())
        yield {
            'address': address,
            'title': title,
            'description': description,
            'date': date,
            'importer': importer,
            'country': country,
            'main_cat': main_cat,
            'sub_cat': sub_cat,
            'child_cat': child_cat,
            # 'external_link': external_link
        }
Now, this is the output I'm receiving 👇🏻
Finished in 0.001997700019273907 seconds
Unhandled error in Deferred:
2022-03-01 13:35:36 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\scrapy\crawler.py", line 192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\scrapy\crawler.py", line 196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\twisted\internet\defer.py", line 1905, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\twisted\internet\defer.py", line 1815, in _cancellableInlineCallbacks
_inlineCallbacks(None, gen, status)
--- <exception caught here> ---
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\twisted\internet\defer.py", line 1660, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\scrapy\crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
builtins.TypeError: 'NoneType' object is not iterable
2022-03-01 13:35:36 [twisted] CRITICAL:
Traceback (most recent call last):
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\twisted\internet\defer.py", line 1660, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "C:\Users\paart\AppData\Roaming\Python\Python310\site-packages\scrapy\crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
PS E:\Web Scraping - Nykaa (New_two)\couponsscraper\couponsscraper>
Please, can anyone help me with where I'm going wrong, and if we shouldn't use threading, should we use 'snakeviz' instead?
Thanks 🙏🏻
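
Answer sketch (hedged): Scrapy is already asynchronous, so inside a spider the usual way to speed this up is simply to yield every Request from start_requests and raise CONCURRENT_REQUESTS (and drop DOWNLOAD_DELAY), rather than calling executor.map on a generator. As written, start_requests itself never yields or returns anything, so it returns None, which is what the "'NoneType' object is not iterable" traceback is complaining about. If you want to see ThreadPoolExecutor itself in action, below is a minimal standalone sketch using the requests library; the two URLs, the worker count, and the fetch helper are illustrative placeholders, not code from the question.

# Hedged sketch, outside Scrapy: fetch several listing pages concurrently
# with ThreadPoolExecutor + requests. URLs and worker count are illustrative only.
import concurrent.futures
import requests

urls = [
    "https://www.nykaa.com/app-api/index.php/products/list?category_id=1390&page_no=1&client=react",
    "https://www.nykaa.com/app-api/index.php/products/list?category_id=8377&page_no=1&client=react",
]
headers = {"user-agent": "Mozilla/5.0"}

def fetch(url):
    # Each call runs in a worker thread; the threads overlap while waiting on network I/O.
    response = requests.get(url, headers=headers, timeout=30)
    return url, response.status_code

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    for url, status in executor.map(fetch, urls):
        print(status, url)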

Related

Connection aborted.', RemoteDisconnected('Remote end closed connection without response') while using python

Hello, I am attempting to reach https://api.louisvuitton.com/api/eng-us/catalog/availability/M80016 through a session while using requests in Python. Currently I am unable to reach it and get an error of "Remote end closed connection without response".
I have been trying to debug but haven't been successful. Below are my code and the output.
Code:
import requests
from requests.auth import HTTPBasicAuth
import json

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'}
s = requests.Session()
r = s.get("https://us.louisvuitton.com/eng-us/products/pocket-organizer-damier-graphite-nvprod2630093v#N60432", headers=headers)
if r:
    print("Requested Successfully")
else:
    print("Request Failed ==> " + str(r))
    exit()

url2 = "https://api.qubit.com/graphql"
payload = json.dumps({
"query": "query ($trackingId: String!, $contextId: String!) {\n property(trackingId: $trackingId) {\n visitor(contextId: $contextId) {\n ipAddress\n ipLocation: location {\n city\n cityCode\n country\n countryCode\n latitude\n longitude\n area\n areaCode\n region\n regionCode\n }\n segment: segments {\n state\n }\n history {\n conversionCycleNumber: conversionCycle\n conversionNumber: conversions\n entranceNumber: entrances\n firstConversionTs: firstConversion\n firstViewTs: firstView\n lastConversionTs: lastConversion\n lastViewTs: lastView\n lifetimeValue\n sessionNumber: sessions\n viewNumber: views\n }\n }\n }\n}",
"variables": {
"trackingId": "louisvuitton_prod",
"contextId": "o6vfrf9jm4g-0k999shdp-fiadwa4"
}})
headers2 = {
    'Content-Type': 'application/json'
}
x = s.post(url2, headers=headers2, data=payload)
if x:
    print("Post Successfully")
else:
    print("Post Failed ==> " + str(x))
    exit()

headers3 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
    'Accept': "*/*",
    'Cache-Control': "no-cache",
    'Host': "api.louisvuitton.com",
    'Accept-Encoding': "gzip, deflate",
    'Connection': "keep-alive",
    'cache-control': "no-cache",
    'Content-Type': 'application/json'
}
cookies = s.cookies
t = s.get("https://api.louisvuitton.com/api/eng-us/catalog/availability/M80016", headers=headers3, cookies=cookies)
if t:
    print("Get Successfully")
else:
    print("Get Failed ==> " + str(t))
    exit()
Output
Requested Successfully
Post Successfully
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/urllib3-1.25.10-py3.8.egg/urllib3/connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "/usr/local/lib/python3.8/site-packages/urllib3-1.25.10-py3.8.egg/urllib3/connectionpool.py", line 426, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/local/lib/python3.8/site-packages/urllib3-1.25.10-py3.8.egg/urllib3/connectionpool.py", line 421, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1347, in getresponse
response.begin()
File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
Does anyone have a clue or an idea how to resolve this issue? Would appreciate any help.
If you inspect the cookies on the webpage in Chrome with Inspect Element -> Application -> Storage -> Cookies -> https://us.louisvuitton.com/, you see about 40 cookies. However, if you add import pprint to your code and, at line 50, pprint.pprint(s.cookies.get_dict()), you see only 4 cookies. So you are missing many cookies.
The response you get is actually an Access Denied message, as you can see if you use Inspect Element -> Network, copy as cURL on the https://api.louisvuitton.com/api/eng-us/catalog/availability/nvprod... URL, and remove the cookies except for your 4 and run it; if you run it with all the cookies it works fine.
So, as there are many XHR requests that can set cookies, I suggest you either go through all the requests, decode them if needed, and read all the JavaScript files to see whether they set cookies, or, a much easier solution, use Selenium, requests-html (https://pypi.org/project/requests-html/) or PyQt, as in the sketch below.
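
A rough sketch of the Selenium route mentioned above (an illustration under assumptions, not a tested solution): load the product page in a real browser so all of the cookies get set, copy them into the requests session, and then call the availability URL. It assumes a matching chromedriver is installed; everything except the two URLs from the question is illustrative.

# Hedged sketch of the Selenium suggestion: let a real browser collect the
# JavaScript-set cookies, then reuse them in a requests.Session.
import requests
from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://us.louisvuitton.com/eng-us/products/pocket-organizer-damier-graphite-nvprod2630093v#N60432")

s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'})
for cookie in driver.get_cookies():  # list of {'name': ..., 'value': ...} dicts
    s.cookies.set(cookie['name'], cookie['value'])
driver.quit()

t = s.get("https://api.louisvuitton.com/api/eng-us/catalog/availability/M80016")
print(t.status_code, t.text[:200])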

Response with scrapy shows wrong or other data

I am scraping a page, but when I request the link with all the information it tells me that the data does not exist. However, when I check the JSON with the Firefox inspector, the response has all the information. I have manipulated the headers, but I have not succeeded in getting it to show me the data.
my code:
settings.py:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
mi_spider.py:
from scrapy import Spider
from scrapy.http import Request
from json import loads, dump

N_categoria = 0
API_key = 'P1MfFHfQMOtL16Zpg36NcntJYCLFm8FqFfudnavl'

class MetrocScrapingSpider(Spider):
    name = 'metroc_scraping'
    allowed_domains = ['metrocuadrado.com']
    start_urls = ['https://www.metrocuadrado.com/']

    def parse(self, response):
        print()
        print('Entra aca 1')
        print()
        aptos_links = response.xpath('//*[@class= "box-list"]')[N_categoria].xpath('.//li//a/@href').extract()
        data_links = []
        for url in aptos_links:
            items = {}
            url = url.split('.com')[-1].split('/')
            for ind, info in enumerate(url):
                if info == '':
                    url.pop(ind)
            items['inmu_'] = url[0]
            items['type_'] = url[1]
            items['loc_'] = url[-1]
            data_links.append(items)
        n_cat = 1
        yield Request(url= response.url,
                      callback= self.first_parse,
                      meta= {'data_links': data_links,
                             'n_cat': n_cat,
                             'aptos_links': aptos_links},
                      dont_filter= True)

    def first_parse(self, response):
        data_links = response.meta['data_links']
        n_cat = response.meta['n_cat']
        aptos_links = response.meta['aptos_links']
        n_from = 0
        cat_linl = aptos_links[n_cat]
        data_link = data_links[n_cat]
        print(data_link)
        inmu_ = data_link['inmu_']
        type_ = data_link['type_']
        loc_ = data_link['loc_']
        api_link = 'https://www.metrocuadrado.com/rest-search/search?realEstateTypeList='+inmu_+'&realEstateBusinessList='+type_+'&city='+loc_+'&from='
        yield Request(url= api_link + str(n_from) + '&size=50',
                      callback= self.main_parse,
                      meta= {'data_links': data_links,
                             'n_cat': n_cat,
                             'n_from': n_from,
                             'api_link': api_link},
                      dont_filter= True,
                      headers= {'Accept': 'application/json, text/plain, */*',
                                'Accept-Encoding': 'gzip, deflate, br',
                                'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
                                'Connection': 'keep-alive',
                                'DNT': '1',
                                'Host': 'www.metrocuadrado.com',
                                'Upgrade-Insecure-Requests': '1',
                                'Referer': cat_linl,
                                'Pragma': 'no-cache',
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
                                'X-Api-Key': API_key,
                                'X-Requested-With': 'XMLHttpRequest'})

    def main_parse(self, response):
        print()
        print(response.url)
        print()
        print(response.status)
        print()
        jsonresponse = loads(response.text)
        print(jsonresponse)
Below are the link and the response status, followed by the JSON response.
As you can see, "totalHits" is 0, "totalEntries" is 0 too, and results is empty. But if you look at the Firefox inspector (screenshot of the request headers omitted), part of the response shows that "totalHits" is 3135 and "totalEntries" is 3135.
I don't know why this happens; any help please?

Instagram login with tor

I am trying to log in to Instagram through Tor.
(I am using Python 3 on a Linux machine, if this helps!)
Here is the code:
import json
import requests
import os
from colorama import Fore
from stem import Signal
from stem.control import Controller

def tor_session():
    session = requests.session()
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'
    return session

def login(username, password):
    # params:
    #   [string]username - the username of the instagram account to log in to
    #   [string]password - the password to use in the log in process
    # description:
    #   logs in to the account with the specified username and with the specified password
    # session setup
    sess = tor_session()
    sess.cookies.update({
        'sessionid': '',
        'mid': '',
        'ig_pr': '1',
        'ig_vw': '1920',
        'csrftoken': '',
        's_network': '',
        'ds_user_id': ''
    })
    sess.headers.update({
        'UserAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'x-instagram-ajax': '1',
        'X-Requested-With': 'XMLHttpRequest',
        'origin': 'https://www.instagram.com',
        'ContentType': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Referer': 'https://www.instagram.com',
        'authority': 'www.instagram.com',
        'Host': 'www.instagram.com',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Encoding': 'gzip, deflate'
    })
    # get csrftoken and the instagram main page
    r = sess.get('https://www.instagram.com/')
    sess.headers.update({'X-CSRFToken': r.cookies.get_dict()['csrftoken']})
    # log in
    data = {'username': username, 'password': password}
    r = sess.post('https://www.instagram.com/accounts/login/ajax/', data=data, allow_redirects=True)
    token = r.cookies.get_dict()['csrftoken']
    sess.headers.update({'X-CSRFToken': token})
    # parse the response from the log in
    data = json.loads(r.text)
    print(data)
    if data['status'] == 'fail':
        return None
    if data['authenticated']:
        return True
    else:
        return False

login("username", "password")
The problem is that almost every time I have tried to run this, it didn't work and threw an exception:
Traceback (most recent call last):
File "main.py", line 156, in <module>
main()
File "main.py", line 152, in main
brute_force(username, pass_file_path)
File "main.py", line 114, in brute_force
logged_in = login(username, password)
File "main.py", line 81, in login
sess.headers.update({'X-CSRFToken': r.cookies.get_dict()['csrftoken']})
KeyError: 'csrftoken'
and sometimes it threw this exception:
File "main.py", line 94, in login
if data['authenticated']:
KeyError: 'authenticated'
How can I fix this?
I tried restarting Tor and changing its configs, but nothing works.
Please help if you can!
It appears that Instagram doesn't set cookies for tor users:
>>> s = your_setup_code_for_session()
>>> r = s.get('https://www.instagram.com')
>>> r.cookies.get_dict()
{}
I also tested this using the Tor Browser and got the same results.
It looks like you'll need to use a VPN or a Tor + VPN combination.

Scrapy throws Exception "raise _DefGen_Return(val) twisted.internet.defer._DefGen_Return:"

When I run the code locally (Windows 10), everything works fine.
I have checked other answers here and other resources, but failed to figure out any solution.
After deploying to ScrapingHub, I'm getting this error message:
[scrapy.core.scraper] Spider error processing <POST http://oris.co.palm-beach.fl.us/or_web1/new_sch.asp> (referer: http://oris.co.palm-beach.fl.us/or_web1/)
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1299, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.6/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 1276, in returnValue
raise _DefGen_Return(val)
twisted.internet.defer._DefGen_Return: <200 http://oris.co.palm-beach.fl.us/or_web1/new_sch.asp>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 42, in process_spider_input
result = method(response=response, spider=spider)
File "/usr/local/lib/python3.6/site-packages/scrapy_pagestorage.py", line 68, in process_spider_input
self.save_response(response, spider)
File "/usr/local/lib/python3.6/site-packages/scrapy_pagestorage.py", line 102, in save_response
self._writer.write(payload)
File "/usr/local/lib/python3.6/site-packages/scrapinghub/hubstorage/batchuploader.py", line 224, in write
data = jsonencode(item)
File "/usr/local/lib/python3.6/site-packages/scrapinghub/hubstorage/serialization.py", line 38, in jsonencode
return dumps(o, default=jsondefault)
File "/usr/local/lib/python3.6/json/__init__.py", line 238, in dumps
**kw).encode(obj)
File "/usr/local/lib/python3.6/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/local/lib/python3.6/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
TypeError: keys must be a string
Here is a snippet of my Scrapy function that throws this error.
The ToDate and FromDate are passed as arguments to the spider:
start_urls = ['http://oris.co.palm-beach.fl.us/or_web1/']

def parse(self, response):
    # inspect_response(response, self)
    url = 'http://oris.co.palm-beach.fl.us/or_web1/new_sch.asp'
    headers = {
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'origin': "http://oris.co.palm-beach.fl.us",
        'content-type': "application/x-www-form-urlencoded",
        'dnt': "1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'cache-control': "no-cache",
    }
    # Date range should be within 90 days
    data = {'FromDate': self.FromDate,
            'PageSize': '500',
            'RecSetSize': '500',
            'ToDate': self.ToDate,
            'consideration': '',
            'search_by': 'DocType',
            'search_entry': 'LP'}
    body = urlencode(data)
    yield scrapy.Request(url, method="POST", headers=headers, body=body, callback=self.parsed)

def parsed(self, response):
    # inspect_response(response, self)
    # Getting all View urls.
    urls = response.xpath("//a[@class = 'list_2']/@href").extract()
    for url in urls:
        url = url.replace('\r', '').replace('\t', '').replace('\n', '')
        url = url.replace('\r', '').replace('\t', '').replace('\n', '')
        url = response.urljoin(url)
        url = url.replace('details.asp', 'details_des.asp') + '&linked=&party_seq='
        yield scrapy.Request(url, callback=self.details)
OK, the issue was with "Messagepack is not available" (this was in the debug log, not in the errors, though) and page storage being enabled for this project.
I have disabled page storage and it works fine now.
I wish error messages were more readable in Scrapy and ScrapingHub.
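
For anyone wondering how that might be done in the spider itself rather than through the Scrapy Cloud UI, a hedged sketch is below; the middleware path 'scrapy_pagestorage.PageStorageMiddleware' and the PAGE_STORAGE_ENABLED flag are assumptions about the scrapy-pagestorage package and may differ between versions.

# Hedged sketch: disable page storage for this spider only. The exact setting
# name and middleware path are assumptions and may vary by scrapy-pagestorage version.
import scrapy

class OrisSpider(scrapy.Spider):
    name = 'oris_lp'
    custom_settings = {
        'PAGE_STORAGE_ENABLED': False,  # assumed addon toggle
        'SPIDER_MIDDLEWARES': {
            'scrapy_pagestorage.PageStorageMiddleware': None,  # None disables a middleware in Scrapy
        },
    }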

Python3 requests posts correctly but gets nothing (but via the browser it is OK)

When I visit 'https://baike.baidu.com/wikitag/taglist?tagId=75953' in Chrome, through Fiddler I find the browser sends a POST request to 'https://baike.baidu.com//wikitag/api/getlemmas'.
So I'm trying to send a 'POST' request with form data to the URL 'https://baike.baidu.com//wikitag/api/getlemmas' and get the JSON data from its 'response'.
I got all the headers and form data through Fiddler and tried to send the same 'POST' request in Python 3 using the requests package.
But even though I send the 'POST' request with the same headers and form data, I get a response (status: 200) with an empty body.
The same request sent via 'postman' is also all right, but in Python 3 it fails anyway.
# -*- coding:UTF-8 -*-
import requests

def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    cookies = {
'BAIDUID':'EEE35ACB030447144E615B191397065B:FG=1;PSTM=1523192637;BIDUPSID=B34DD366905D15BB907C1667346970AE;Hm_lvt_55b574651fcae74b0a9f1cf9c8d7c93a=1522304864,1522305101,1523192946,1523253565;PSINO=2;H_PS_PSSID=1990_1438_26082_21 125_22074;BDORZ=B490B5EBF6F3CD402E515D22BCDA1598'
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Length': '91',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagID': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    req = requests.post(url=target, data=forms, verify=False, headers=headers)
    print(req.text)
    """
    html = json.loads(req.text)
    for each in html['lemmaList']:
        print('lemmaCroppedTitle:', each['lemmaCroppedTitle'])
    print(req.text)
    """

def main():
    disease_json()

if __name__ == '__main__':
    main()
Following is the correct request sent by the browser (screenshot omitted).
Modified the content-type and your request payload. Also added a method encode_multipart_data for payload transformation, to be consistent with multipart/form-data:
import sys
import requests

def encode_multipart_data(fields):
    boundary = '------WebKitFormBoundary7MA4YWxkTrZu0gW'
    CRLF = '\r\n'
    L = []
    for key, value in fields.items():
        L.append(boundary)
        L.append('Content-Disposition: form-data; name="%s"\r\n' % key)
        L.append(value)
    L.append(boundary + "--")
    body = CRLF.join(L)
    return body

def disease_json():
    host = 'https://baike.baidu.com'
    target = host + '/wikitag/api/getlemmas'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        # changed content-type
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'Referer': 'https://baike.baidu.com/wikitag/taglist?tagId=75953',
        'Origin': 'https://baike.baidu.com',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com'
    }
    forms = {
        'limit': '24',
        'timeout': '3000',
        'filterTags': '[]',
        'tagId': '75953',
        'fromLemma': 'false',
        'contentLength': '40',
        'page': '0',
    }
    payload = encode_multipart_data(forms)
    resp = requests.post(url=target, data=payload, headers=headers)
    print(resp.text)

if __name__ == '__main__':
    disease_json()
This approach can also solve the problem.
import requests
import http.cookiejar
import json

url = "https://baike.baidu.com/wikitag/api/getlemmas"
payload = "limit=24&timeout=3000&filtetTags=%5B%5D&tagId=75953&fromLemma=false&contentLegth=40&page=0"
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181\
Safari/537.36"
}

def get_cookies():
    session = requests.Session()
    session.cookies = http.cookiejar.LWPCookieJar("cookie")
    response = session.post(url, headers=headers, data=payload, allow_redirects=False, verify=False)
    session.cookies.save(ignore_discard=True, ignore_expires=True)
    return response

def disease_json(times=-1):
    times += 1
    response = get_cookies()
    if response.status_code == 302:
        session = requests.session()
        session.cookies = http.cookiejar.LWPCookieJar(filename='cookie')
        session.cookies.load(ignore_discard=True)
        url = response.headers['Location']
        response = session.post(url, headers=headers, data=payload, allow_redirects=False)
        json_data = response.text
        print(json.loads(json_data))
    print(times)

if __name__ == '__main__':
    disease_json()
