Why do I get HTTP Error 503 using urllib and BS4? - python-3.x

I use BS4 to get the "Browse Standards by Technology" listing from the website https://standards.globalspec.com/:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://standards.globalspec.com/"
q1 = urlopen(url)
soup = BeautifulSoup(q1, 'lxml')
print(soup)
But I get an error: urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable
Can anyone see what could be causing this error?

@Samt94 has already stated that the website is under Cloudflare protection, so you can use cloudscraper instead of requests:
from bs4 import BeautifulSoup
import cloudscraper
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://standards.globalspec.com/'
req = scraper.get(url)
print(req)
soup = BeautifulSoup(req.text,'lxml')
Output:
<Response [200]>

You can use CloudScraper to access websites that use CloudFlare DDoS Protection:
from bs4 import BeautifulSoup
import cloudscraper
url = "https://standards.globalspec.com/"
scraper = cloudscraper.create_scraper()
q1 = scraper.get(url)
soup = BeautifulSoup(q1.text, 'lxml')
print(soup)
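As a hypothetical next step (the selector below is a guess; inspect the page's HTML to narrow it down to the "Browse Standards by Technology" block), you could list the links exposed by the parsed page:
# Hypothetical follow-up: print every link found in the parsed page.
# Narrowing this down to the "Browse Standards by Technology" section
# requires inspecting the site's actual HTML structure.
for a in soup.find_all('a', href=True):
    print(a.get_text(strip=True), a['href'])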

Related

Is there any way to get all request URLs

import mechanize
import json
import re
from bs4 import BeautifulSoup
url = "https://app.propertymeld.com/login/?next=/"
browser = mechanize.Browser()
browser.open(url)
browser.select_form(nr = 0)
browser.form['email'] = "aaa"
browser.form['password'] = "bbb"
result = browser.submit().read()
print(result)
soup = BeautifulSoup(result,"html.parser")
print(soup)
Is there any method to get the request URL from the website? After logging in, the URL is hidden; only JavaScript is available.
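One way to see which URLs are actually requested after the login form is submitted is to inspect the redirect chain. A minimal sketch with requests (the form field names and credentials are taken from the question and may need adjusting for the real login endpoint):
import requests

# Sketch: post the login form directly and inspect the redirect chain
# to see which URLs were requested after login. Field names are assumed
# to match the form in the question.
session = requests.Session()
resp = session.post("https://app.propertymeld.com/login/?next=/",
                    data={"email": "aaa", "password": "bbb"})
for hop in resp.history:            # each intermediate redirect response
    print(hop.status_code, hop.url)
print(resp.status_code, resp.url)   # final URL after all redirects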

How to implement Confluence Forum Authorization into CrawlSpider

I want to crawl a Confluence forum that requires authentication as well as authorization. After long research I somehow managed to make a script that does exactly that, but I'm kind of lost on how to implement the auth in front of the CrawlSpider, including the header with the auth token, so that the spider is logged in and able to crawl all accessible subsites.
Here's my double-auth script:
from urllib import response
from atlassian import Confluence
import subprocess
from urllib.parse import urlparse
import atlassian
from bs4 import BeautifulSoup
import getpass
import requests
import logging
from PyQt6.QtGui import *
from PyQt6.QtCore import *
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By

logging.basicConfig(filename='conf_connect.log', filemode='w', level=logging.DEBUG)

global url
url = 'http://MYCONFL.atlassian.net/wiki/'

confluence = Confluence(
    url='http://MYCONFL.atlassian.net/wiki/',
    username='USERNAME',
    password='PASSWORD')

with requests.session() as s:
    auth = ('USERNAME', getpass.getpass())
    session = requests.session()
    session.verify = True
    session.auth = auth
    headers = {
        'Authorization': 'Basic base64encodedauthtoken',
        'Content-Type': 'application/json'}
    # storing response
    response = requests.get(url)
    if response.status_code != 200:
        print("Error - Something went wrong!")
    else:
        print("all good, Code 200")
Here is the CrawlSpider's code:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract

url_list = [
    'https://www.MYCONFL.atlassian.com/wiki/'
]

class CrawlySpider(CrawlSpider):
    name = 'crawly'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = url_list

    rules = (
        # Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        domain = tldextract.extract(response.request.url)[1]
        path = urlparse(response.request.url)[2].replace("/", "")
        with open(f'{domain}.txt', 'a') as fp:
            fp.write(text)
        with open(f'{domain} {path}.html', 'wb') as fp:
            fp.write(response.body)
Thanks in Advance for Advice :D
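One common way to attach the auth header to every request a CrawlSpider makes is Scrapy's DEFAULT_REQUEST_HEADERS setting, set per spider via custom_settings. A sketch only: the token value is a placeholder, and whether basic auth is sufficient depends on how your Confluence instance handles login (SSO setups usually need a session cookie instead):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class AuthCrawlySpider(CrawlSpider):
    name = 'crawly_auth'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = ['https://www.MYCONFL.atlassian.com/wiki/']

    # DEFAULT_REQUEST_HEADERS is merged into every request the spider sends,
    # so the Authorization header goes out with each crawled page.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Authorization': 'Basic base64encodedauthtoken',  # placeholder token
        },
    }

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # Replace with the file-writing logic from the original spider.
        self.logger.info("Crawled %s (%s)", response.url, response.status)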

How to fix BeautifulSoup SSL CERTIFICATE_VERIFY_FAILED error

Code:
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
html = urlopen("https://www.familyeducation.com/baby-names/browse-origin/surname/german")
soup = BeautifulSoup(html)
metadata=soup.find_all('meta')
Error:
urlopen error [SSL: CERTIFICATE_VERIFY_FAILED]
For this error check out this answer:
urllib and "SSL: CERTIFICATE_VERIFY_FAILED" Error
But you don't always need urlopen to make an HTML request. You can also send the request through the requests library. Try this one:
import requests
from bs4 import BeautifulSoup
html = requests.get("https://www.familyeducation.com/baby-names/browse-origin/surname/german")
soup = BeautifulSoup(html.text, "html.parser")
metadata = soup.find_all('meta')
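If you want to keep using urlopen, another commonly used workaround (a sketch; disabling certificate verification is only advisable for quick experiments) is to pass an unverified SSL context:
import ssl
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Sketch: skip certificate verification for this one request only.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html = urlopen("https://www.familyeducation.com/baby-names/browse-origin/surname/german",
               context=ctx)
soup = BeautifulSoup(html, "html.parser")
metadata = soup.find_all('meta')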

Unable to understand the 403 Error from HTML parsing using BeautifulSoup4 with Python3.x

I am taking the Python for Everybody course on Coursera and attempted one of the questions from the textbook:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'https://www.py4e.com/book.htm'
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
I don't understand the error:
urllib.error.HTTPError: HTTP Error 403: Forbidden
But according to the full traceback, the error starts at line 18. From reading other SO posts and this similar question, it probably has something to do with the SSL certificate and with the website thinking I'm a bot.
Why doesn't the code work?
import requests
from bs4 import BeautifulSoup
url = 'https://www.py4e.com/book.htm'
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
Link = requests.get(url, headers=headers)
soup = BeautifulSoup(Link.content, "lxml")
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
Output:
http://amzn.to/1KkULF3
book/index.htm
http://amzn.to/1KkULF3
http://amzn.to/1hLcoBy
http://amzn.to/1KkV42z
http://amzn.to/1fNOnbd
http://amzn.to/1N74xLt
http://do1.dr-chuck.net/py4inf/EN-us/book.pdf
http://do1.dr-chuck.net/py4inf/ES-es/book.pdf
https://twitter.com/fertardio
translations/KO/book_009_ko.pdf
http://www.xwmooc.net/python/
http://fanwscu.gitbooks.io/py4inf-zh-cn/
book_270.epub
translations/ES/book_272_es4.epub
https://www.gitbook.com/download/epub/book/fanwscu/py4inf-zh-cn
html-270/
html_270.zip
http://itunes.apple.com/us/book/python-for-informatics/id554638579?mt=13
http://www-personal.umich.edu/~csev/books/py4inf/ibooks//python_for_informatics.
ibooks
http://www.py4inf.com/code
http://www.greenteapress.com/thinkpython/thinkCSpy/
http://allendowney.com/
Updated code for urllib:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'https://www.py4e.com/book.htm'
from urllib.request import Request, urlopen
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
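If you prefer to set the header once rather than per request, urllib also lets you install a global opener with a custom User-Agent (a sketch):
import urllib.request
from bs4 import BeautifulSoup

# Sketch: every subsequent urlopen call will send this User-Agent.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

html = urllib.request.urlopen('https://www.py4e.com/book.htm').read()
soup = BeautifulSoup(html, 'html.parser')
for tag in soup('a'):
    print(tag.get('href', None))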

Python3 requests.get ignoring part of my URL (BEAUTIFULSOUP + PYTHON WEBSCRAPING)

I'm using requests.get like so:
import urllib3
import requests
from bs4 import BeautifulSoup
urllib3.disable_warnings()
cookies = {
    'PYPF': '3OyMLS2-xJlxKilWEOSvMQXAhyCgIhvAxYfbB8S_5lGBxxAS18Z7I8Q',
    '_ga': 'GA1.2.227320333.1496647453',
    '_gat': '1',
    '_gid': 'GA1.2.75815641.1496647453'
}
params = {
    'platform': 'xbox'
}
page = requests.get("http://www.rl-trades.com/#pf=xbox", headers={'Platform': 'Xbox'}, verify=False, cookies=cookies, params=params).text
page
soup = BeautifulSoup(page, 'html.parser')
... etc.
But from my results in testing, it seems requests.get is ignoring the '/#pf=xbox' part of 'http://www.rl-trades.com/#pf=xbox'.
Is this because I have to set verify to False? What is going on here?
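Worth noting: the part after # is a URL fragment, which clients handle locally (usually via JavaScript) and never send to the server, so this is unrelated to verify=False. A quick check with the standard library (a sketch):
from urllib.parse import urlsplit

# The fragment ('pf=xbox') is split off client-side and is not part of
# the HTTP request; the server only ever sees the path '/'.
parts = urlsplit("http://www.rl-trades.com/#pf=xbox")
print(parts.path)      # '/'
print(parts.fragment)  # 'pf=xbox'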
