Changing Scrapy/Splash user agent - python-3.x

How can I set the user agent for Scrapy with Splash in a way equivalent to the following requests example?
import requests
from bs4 import BeautifulSoup
ua = {"User-Agent":"Mozilla/5.0"}
url = ""
page = requests.get(url, headers=ua)
soup = BeautifulSoup(page.text, "lxml")
My spider would look similar to this:
import scrapy
from scrapy_splash import SplashRequest
class ExampleSpider(scrapy.Spider):
name = "example"
allowed_domains = [""]
start_urls = [""]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(
args={'wait': 0.5}

You need to set the user_agent attribute on the spider to override the default user agent:
class ExampleSpider(scrapy.Spider):
name = 'example'
user_agent = 'Mozilla/5.0'
In this case UserAgentMiddleware (which is enabled by default) will override the USER_AGENT setting value with 'Mozilla/5.0'.
You can also override headers per request:
scrapy_splash.SplashRequest(url, headers={'User-Agent': custom_user_agent})

The proper way is to alter the Splash script to include it... no need to add it to the spider, though, if that works as well.

If you use pure Splash (not the scrapy-splash package), you can just pass a headers param with a 'User-Agent' key. All requests made for this page will then use this user agent.
Here is an example:
import requests
import json
headers = {
'User-Agent': 'Mozilla/5.0',
param = {
'url': your_aim_url,
'headers': headers,
'html': 1,
'har': 1,
'response_body': 1,
session = requests.Session()
session.headers.update({'Content-Type': 'application/json'})
response ='', json=param)
response_json = json.loads(response.text, encoding='utf-8')
print(response_json.get('html')) # page html
print(response_json.get('har')) # har with respose body. if do not want respose body, set 'response_body' to 0
You can check the request header in har to see if the user-agent is correct.


Why Is JSON Truncated During Linux HTML Response Parsing?

import requests
from bs4 import BeautifulSoup
url = ""
headers = {
'User-Agent': "PostmanRuntime/7.15.2",
response = requests.request("GET", url, headers=headers)
r = response.text
soup = BeautifulSoup(response.text, "lxml")
textarea = soup.find('textarea', attrs={'id': 'song-list-pre-data'}).get_text()
In the Linux environment, the matching result JSON is truncated.
the textarea :xxxxxx ee":0,"album":{"id":158052587,"name":"Sakana~( ˵>ㅿㅿ
I think it's probably because of the special symbols.
How do you deal with this situation?
You need to convert the string into a list of JSON objects;
then each song can be printed.
I tested this on Ubuntu 20.04 and on Windows in the VS Code terminal.
Both work.
import requests
import json
from bs4 import BeautifulSoup
url = ""
headers = {
'User-Agent': "PostmanRuntime/7.15.2",
response = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(response.text)
textarea = soup.find('textarea', attrs={'id': 'song-list-pre-data'}).get_text()
json_list = json.loads(textarea)
for song in json_list:
print("album:", song['album']['name'], ", artists: ", song['artists'][0]['name'], "duration: ", song['duration'])
Result on Ubuntu 20.04
Result on the VS Code terminal on Windows

Download Specific file from Website with BeautifulSoup

Following the documentation of BeautifulSoup, I am trying to download a specific file from a webpage. First trying to find the link that contains the file name:
import re
import requests
from bs4 import BeautifulSoup
url = requests.get("")
parsed = BeautifulSoup(url.text, "html.parser")
link = parsed.find("a", text=re.compile("TASA_DOLAR_REFERENCIA_MC.xls"))
path = link.get('href')
But with no success. Then trying to print every link on that page, I get no links:
import re
import requests
from bs4 import BeautifulSoup
url = requests.get("")
parsed = BeautifulSoup(url.text, "html.parser")
link = parsed.find_all('a')
for links in parsed.find_all("a href"):
print(links.get('a href'))
It looks like the url of the file is dynamic, it adds a ?v=123456789 parameter to the end of the url, like the file version, that's why I need to download the file using the file name.
Actually you are dealing with a dynamic JavaScript page which is fully loaded via an XHR request to the following url once the page loads.
Below is a direct call to the back-end API, which identifies the request by the page id (2538); from its response we can then load your desired URL.
import requests
from bs4 import BeautifulSoup
def main(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
with requests.Session() as req:
data = {
"id": "2538",
"languageName": "es"
r =, data=data)
soup = BeautifulSoup(r.json()['result']['article']['content'], 'lxml')
target = soup.select_one('a[href*=TASA_DOLAR_REFERENCIA_MC]')['href']
r = req.get(target)
with open('data.xls', 'wb') as f:
if __name__ == "__main__":

Attempting login with Scrapy-Splash

Since I am not able to log in to the site, I tried many different methods, such as Selenium, with which I logged in successfully but didn't manage to start crawling.
Now I have tried my luck with scrapy-splash, but I can't log in :(
If I render the login page with Splash, I see the following picture:
Well, there should be a login form with username and password fields, but Scrapy can't see it?
I have been sitting in front of that login form for about a week and am losing my will to live.
My last question didn't get a single answer, so now I am trying again.
Here is the HTML code of the login form:
When I log in manually, I get redirected to "/login?returnUrl=", where I only have this form data:
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.spiders import CrawlSpider, Rule
from ..items import ScrapysplashItem
from scrapy.http import FormRequest, Request
import csv
class DuifSplash(CrawlSpider):
name = "duifsplash"
allowed_domains = ['']
login_page = ''
with open('duifonlylinks.csv', 'r') as f:
reader = csv.DictReader(f)
start_urls = [items['Link'] for items in reader]
def start_requests(self):
yield SplashRequest(
def parse(self, response):
return FormRequest.from_response(
'username' : 'not real',
'password' : 'login data',
}, callback=self.after_login)
def after_login(self, response):
accview = response.xpath('//div[#class="c-accountbox clearfix js-match-height"]/h3')
if accview:
for url in self.start_urls:
yield response.follow(url=url, callback=self.parse_page)
def parse_page(self, response):
productpage = response.xpath('//div[#class="product-details col-md-12"]')
if not productpage:
print('No productlink', response.url)
for a in productpage:
items = ScrapysplashItem()
items['SKU'] = response.xpath('//p[#class="desc"]/text()').get()
items['Title'] = response.xpath('//h1[#class="product-title"]/text()').get()
items['Link'] = response.url
items['Images'] = response.xpath('//div[#class="inner"]/img/#src').getall()
items['Stock'] = response.xpath('//div[#class="desc"]/ul/li/em/text()').getall()
items['Desc'] = response.xpath('//div[#class="item"]/p/text()').getall()
items['Title_small'] = response.xpath('//div[#class="left"]/p/text()').get()
items['Price'] = response.xpath('//div[#class="price"]/span/text()').get()
yield items
In my "prework", I crawled every internal link and saved it to a .csv file, in which I analyse which of the links are product links and which are not.
Now I wonder: if I open a link from my CSV, does it open in an authenticated session or not?
I can't find any cookies, which also seems strange to me.
I managed to log in successfully :-) Now I only need to know where the cookies are stored.
Lua Script
function main(splash, args)
local title = splash.evaljs("document.title"),
return {
cookies = splash:get_cookies(),
I don't think using Splash here is the way to go, as even with a normal Request the form is there: response.xpath('//form[@id="login-form"]')
There are multiple forms available on the page, so you have to specify which form FormRequest.from_response should be based on. It is best to specify the clickdata as well (so it goes to 'Login', not to 'forgot password'). In summary it would look something like this:
req = FormRequest.from_response(
'username' : 'not real',
'password' : 'login data'},
clickdata={'type': 'submit'}
If you don't use Splash, you don't have to worry about passing cookies - this is taken care of by Scrapy. Just make sure you don't put COOKIES_ENABLED=False in your settings.py.

Unable to understand the 403 Error from HTML parsing using BeautifulSoup4 with Python3.x

I am taking the Coursera course Python For Everyone and I attempted one of the questions from the textbook:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = ''
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
print(tag.get('href', None))
I don't understand the error:
urllib.error.HTTPError: HTTP Error 403: Forbidden
But according to the full traceback, it starts at line 18. From reading other SO posts and this similar question, it probably has something to do with the SSL certificate and how the website thinks I'm a bot.
Why doesn't the code work?
import requests
from bs4 import BeautifulSoup
url = ''
headers = requests.utils.default_headers()
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
Link = requests.get(url, headers=headers)
soup =BeautifulSoup(Link.content,"lxml")
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
print(tag.get('href', None))
Updated code for urllib:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = ''
from urllib.request import Request, urlopen
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
print(tag.get('href', None))

Python3 requests.get ignoring part of my URL (BEAUTIFULSOUP + PYTHON WEBSCRAPING)

I'm using requests.get like so:
import urllib3
import requests
from bs4 import BeautifulSoup
cookies = {
'PYPF': '3OyMLS2-xJlxKilWEOSvMQXAhyCgIhvAxYfbB8S_5lGBxxAS18Z7I8Q',
'_ga': 'GA1.2.227320333.1496647453',
'_gat': '1',
'_gid': 'GA1.2.75815641.1496647453'
params = {
'platform': 'xbox'
page = requests.get("", headers={'Platform': 'Xbox'}, verify=False, cookies=cookies, params=params).text
soup = BeautifulSoup(page, 'html.parser')
... etc.
But, from my testing, it seems requests.get is ignoring '/#pf=xbox' in ''.
Is this because I have to set verify to False? What is going on here?
