Python 3 - Add custom headers to urllib.request Request - python-3.x

In Python 3, the following code obtains the HTML source for a webpage.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
response = urllib.request.urlopen(url)
response.read()
How can I add the following custom header to the request when using urllib.request?
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }

The request headers can be customized by first creating a request object then supplying it to urlopen.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
req = urllib.request.Request(url, headers=hdr)
response = urllib.request.urlopen(req)
response.read()
Source: Python 3.4 Documentation

import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
response = urllib.request.urlopen("url")
response.read()
Should you wish to learn about the details you can refer to the python documentation: https://docs.python.org/3/library/urllib.request.html

#Using urllib.request, with urlopen, allows to open the specified URL.
#Headers can be included inside the urlopen along with the url.
from urllib.request import urlopen
url = "https://docs.python.org/3.4/howto/urllib2.html"
header = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
response = urlopen(url, headers=header)
response.read()

Related

Get response 403 when i'm trying to crawling, user agent doesn't work in Python 3

I'm trying to crawling this website and get the message:
"You don't have permission to access"
there is a way to bypass this ? already used user agents and urlopen
Here is my code:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.request import Request, urlopen
url = 'https://www.oref.org.il/12481-he/Pakar.aspx'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}
res = requests.get(url, headers=header)
soup = BeautifulSoup(res.content, 'html.parser')
print(res)
output:
<Response [403]>
also tried to do that:
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'})
webpage = urlopen(req).read()
output:
HTTP Error 403: Forbidden
still blocked and get response 403, Anyone who can help?

WebScraping / Identical sites not working?

i would like to scrape the header-element from these both links -
For me this 2 sites look absolute identical - pics see below
Why is only the scraping for the second link working and not for the first?
import time
import requests
from bs4 import BeautifulSoup
# not working
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
page = requests.get (link)
time.sleep (1)
soup = BeautifulSoup (page.content, "html.parser")
erg = soup.find("header")
print(f"First Link: {erg}")
# working
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
page = requests.get (link)
time.sleep (1)
soup = BeautifulSoup (page.content, "html.parser")
erg = soup.find("header")
print(f"Second Link: {len(erg)}")
Working:
Not Working:
The page is sometimes loaded by JavaScript, so request won't support it.
You can use a while loop to check if header appears in the soup and then break
import requests
from bs4 import BeautifulSoup
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
while True:
soup = BeautifulSoup(requests.get(link).content, "html.parser")
header = soup.find("header")
if header:
break
print(header)
Try this to get whatever fields you wish to grab from those links. curently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any field of your interest. Mkae sure to put the urls within this list urls_to_scrape.
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
urls_to_scrape = {
'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4'
}
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'
params = {
'platform': 'web',
'additionalPlatforms': 'appletv,ipad,iphone,mac',
'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
'l': 'en-us',
'limit[merchandised-in-apps]': '20',
'omit[resource]': 'autos',
'sparseLimit[apps:related-editorial-items]': '5'
}
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
res = s.get(base_url)
soup = BeautifulSoup(res.text,"lxml")
token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
token = json.loads(unquote(token_raw))['MEDIA_API']['token']
s.headers['Accept'] = 'application/json'
s.headers['Referer'] = 'https://apps.apple.com/'
s.headers['Authorization'] = f'Bearer {token}'
for url in urls_to_scrape:
id_ = url.split("/")[-1].strip("id").split("?")[0]
res = s.get(link.format(id_),params=params)
title = res.json()['data'][0]['attributes']['name']
print(title)

Scraping a Video from a website and downloading it as mp4

I am trying to scrape this webiste and download the soccer goal that is in it. I saw this Stackoverflow post and I tried the solution and it still does not work.
Here is the code I have done
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4p'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
with open('video.mp4', 'wb') as f_out:
r = requests.get(url, headers=headers, stream=True)
print(r)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f_out.write(chunk)
when I printed the request it showed me 403 and because of that the video will not open or get downloaded, any thoughts on how I can download the video?
Use this below url.
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4?Expires=1621994280&Signature=IqySuJxyVi9pCmC~JUhl-iyp-LmG6OiAFfQeu-~-a55osCfu9VrksEhzaQzJlMxAHcSt1R4j9Pt-G8sblQeFt3UtGqY-neHJkC4mUxuHjxGWAWdksyiAxkMb8DYRLkvIseUfkbKbeO6Dt807QwMkspFmXYdzljm8DLho6nMQfC--jtfy8B2gONhA9YUmK2o~fUHwTHzTXXqNGct2hQl-B9cFLDBdj8LXWTj-75YInwWxLwtoenKK~qLahGtJXKXvxTVltxMvUYXXvP9F~WfhNIhNqns1JKrrrqJ~N1XunZHCv~IVJyzOEvrn2G4J5LMIn~dcEZ9frV3APHsE4D~HQA__&Key-Pair-Id=APKAIEYUVEN4EVB2OKEQ'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
with open('video.mp4', 'wb') as f_out:
r = requests.get(url, headers=headers, stream=True)
print(r)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f_out.write(chunk)
This is how you extract the video link from Streamable if you want to in a programmatic way
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
url = "https://streamable.com/a50s3e"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Pull all text from the BodyText div
video = soup.find_all('video')
temp = video[0].get("src")
url = "https:"+temp
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
with open('video.mp4', 'wb') as f_out:
r = requests.get(url, headers=headers, stream=True)
print(r)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f_out.write(chunk)

Web-scraping Yell.com in Python

After reading a LOT, I have tried to do my first step in web scraping at yell website with urllib and requests but I get the same in both cases (404 not found).
The url is:
url = https://www.yell.com/
What I have tried:
urllib package
import urllib.request
f = urllib.request.urlopen(url)
print(f.read(100))
and
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
opener.open(url)
requests package
url = 'www.yell.com'
response = requests.get(url)
and
headers = {'Accept': 'text/html'}
response = requests.get(url, headers=headers)
But i reach to the 404 error.
Try this using urllib
import urllib.request
url = 'https://www.yell.com/'
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())
I would suggest you to use requests + beautifulsoup4
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
it will make your scraping life easier
#You can use also selenium to avoid http errors
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
main_url = 'https://www.yell.com/'
driver = webdriver.Chrome(r'write chromedriver path')
driver.get(main_url)
res = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(res, 'html.parser')
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
request = urllib.request.Request(main_url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())

how to extract links and handle the page as it is loading again and again using python beautifulsoup

tring to extract links and want to handle loading. but not the links even.
code:
from bs4 import BeautifulSoup
import requests
r = requests.get('http://www.indiabusinessguide.in/business-categories/agriculture/agricultural-equipment.html')
soup = BeautifulSoup(r.text,'lxml')
links = soup.find_all('a',class_='link_orange')
for link in links:
print(link['href'])
please help me to handle this loading and extraction of links.
Try using the lxml library. Response is received by posting a request to the url using Requests.
import requests
import lxml
from lxml import html
contact_list = []
def scrape(url, pages):
for page in range(1, pages):
headers = {
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"Cookie": "PHPSESSID=2q0tk3fi1kid0gbdfboh94ed56",
}
data = {
"page": f"{page}"
}
r = requests.post(url, headers=headers, data=data)
tree = html.fromstring(r.content)
links= tree.xpath('//a[#class="link_orange"]')
for link in links:
# print(link.get('href'))
contact_list.append(link.get('href'))
url = "http://www.indiabusinessguide.in/ajax_advertiselist.php"
scrape(url, 10)
print(contact_list)
print(len(contact_list))

Resources