How to implement Confluence Forum Authorization into CrawlSpider - python-3.x

I want to crawl a Confluence forum that requires authentication as well as authorization. After a lot of research I somehow managed to put together a script that does exactly that, but I'm rather lost on how to put the auth in front of the CrawlSpider, including the header with the auth token, so that the spider is logged in and able to crawl all accessible subsites.
Here's my double-auth script:
from urllib import response
from atlassian import Confluence
import subprocess
from urllib.parse import urlparse
import atlassian
from bs4 import BeautifulSoup
import getpass
import requests
import logging
from PyQt6.QtGui import *
from PyQt6.QtCore import *
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By

logging.basicConfig(filename='conf_connect.log', filemode='w', level=logging.DEBUG)

global url
url = 'http://MYCONFL.atlassian.net/wiki/'

confluence = Confluence(
    url='http://MYCONFL.atlassian.net/wiki/',
    username='USERNAME',
    password='PASSWORD')

with requests.session() as s:
    auth = ('USERNAME', getpass.getpass())
    session = requests.session()
    session.verify = True
    session.auth = auth
    headers = {
        'Authorization': 'Basic base64encodedauthtoken',
        'Content-Type': 'application/json'}

# storing response
response = requests.get(url)
if response.status_code != 200:
    print("Error - Something went wrong!")
else:
    print("all good, Code 200")
Here is the CrawlSpider's code:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract

url_list = [
    'https://www.MYCONFL.atlassian.com/wiki/'
]

class CrawlySpider(CrawlSpider):
    name = 'crawly'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = url_list

    rules = (
        # Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        domain = tldextract.extract(response.request.url)[1]
        path = urlparse(response.request.url)[2].replace("/", "")
        with open(f'{domain}.txt', 'a') as fp:
            fp.write(text)
        with open(f'{domain} {path}.html', 'wb') as fp:
            fp.write(response.body)
Thanks in advance for any advice :D
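One possible direction (a minimal sketch, not verified against a real Confluence instance; USERNAME, the token and the URLs are placeholders): Scrapy can carry the credentials on every request itself, either through its built-in HttpAuthMiddleware (the http_user/http_pass spider attributes) or through the DEFAULT_REQUEST_HEADERS setting, so the CrawlSpider no longer depends on the separate requests session:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class AuthedCrawlySpider(CrawlSpider):
    name = 'crawly_authed'
    allowed_domains = ['MYCONFL.atlassian.net']
    start_urls = ['https://MYCONFL.atlassian.net/wiki/']

    # Option 1: Scrapy's HttpAuthMiddleware sends HTTP Basic auth to the
    # allowed domains when these attributes are set (placeholder values).
    http_user = 'USERNAME'
    http_pass = 'API_TOKEN_OR_PASSWORD'

    # Option 2: attach an explicit Authorization header to every request
    # instead (the token below is a placeholder, as in the script above).
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Authorization': 'Basic base64encodedauthtoken',
        },
    }

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # Just log what was reached; plug the BeautifulSoup handling in here.
        self.logger.info('Crawled %s (%s)', response.url, response.status)
Either option on its own is usually enough; whether Basic auth suffices depends on how the Confluence instance is configured. If it insists on a session cookie, the login has to happen first (e.g. with requests or Selenium) and the resulting cookies passed into the spider's requests.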

Related

Is there any way to get all request URLs

import mechanize
import json
import re
from bs4 import BeautifulSoup
url = "https://app.propertymeld.com/login/?next=/"
browser = mechanize.Browser()
browser.open(url)
browser.select_form(nr = 0)
browser.form['email'] = "aaa"
browser.form['password'] = "bbb"
result = browser.submit().read()
print(result)
soup = BeautifulSoup(result,"html.parser")
print(soup)
Is there any method to get the request URL from the website? After logging in to the website, the URL is hidden. Only JavaScript is available.
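If the goal is just to see which URL the login form posts to and where it lands afterwards, mechanize exposes both (a small sketch along the lines of the code above, with aaa/bbb still placeholders); anything the page builds with JavaScript, however, will not be visible to mechanize at all:
import mechanize

url = "https://app.propertymeld.com/login/?next=/"
browser = mechanize.Browser()
browser.open(url)
browser.select_form(nr=0)

# URL the login form will be submitted to.
print(browser.form.action)

browser.form['email'] = "aaa"
browser.form['password'] = "bbb"
response = browser.submit()

# URL the browser ended up on after the (possibly redirected) login.
print(response.geturl())
print(browser.geturl())
For requests that are created by JavaScript after login, the browser's network tab (or a real browser driven by Selenium) is the place to look.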

Why do I get HTTP Error 503 using urllib and BS4?

I use BS4 to get Browse Standards by Technology from the website https://standards.globalspec.com/:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://standards.globalspec.com/"
q1 = urlopen(url)
soup = BeautifulSoup(q1, 'lxml')
print(soup)
But I get an error: urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable
Could anyone see what could be causing this error?
@Samt94 has already stated that the website is under Cloudflare protection, so you can use cloudscraper instead of requests:
from bs4 import BeautifulSoup
import cloudscraper
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://standards.globalspec.com/'
req = scraper.get(url)
print(req)
soup = BeautifulSoup(req.text,'lxml')
Output:
<Response [200]>
You can use CloudScraper to access websites that use CloudFlare DDoS Protection:
from bs4 import BeautifulSoup
import cloudscraper
url = "https://standards.globalspec.com/"
scraper = cloudscraper.create_scraper()
q1 = scraper.get(url)
soup = BeautifulSoup(q1.text, 'lxml')
print(soup)

How to fix BeautifulSoup SSL CERTIFICATE_VERIFY_FAILED error

Code:
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
html = urlopen("https://www.familyeducation.com/baby-names/browse-origin/surname/german")
soup = BeautifulSoup(html)
metadata=soup.find_all('meta')
Error:
urlopen error [SSL: CERTIFICATE_VERIFY_FAILED]
For this error check out this answer:
urllib and "SSL: CERTIFICATE_VERIFY_FAILED" Error
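The usual fix from that answer is to hand urlopen an SSL context that skips certificate verification (a quick sketch; disabling verification is only reasonable for experiments, not production):
import ssl
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Build a context that performs no certificate checks at all.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html = urlopen("https://www.familyeducation.com/baby-names/browse-origin/surname/german", context=ctx)
soup = BeautifulSoup(html, "html.parser")
metadata = soup.find_all('meta')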
But you don't always need urlopen for an HTML request. You can also send the request through the requests lib. Try this one:
import requests
from bs4 import BeautifulSoup
html = requests.get("https://www.familyeducation.com/baby-names/browse-origin/surname/german")
soup = BeautifulSoup(html.text, "html.parser")
metadata = soup.find_all('meta')

Unable to understand the 403 Error from HTML parsing using BeautifulSoup4 with Python3.x

I am taking the Coursera course Python For Everyone and I attempted one of the questions from the textbook:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'https://www.py4e.com/book.htm'
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
I don't understand the error:
urllib.error.HTTPError: HTTP Error 403: Forbidden
But according to the full traceback, it starts at line 18. From reading other SO posts and this similar question, it probably has something to do with the SSL certificate and with the website thinking I'm a bot.
Why doesn't the code work?
import requests
from bs4 import BeautifulSoup
url = 'https://www.py4e.com/book.htm'
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
Link = requests.get(url, headers=headers)
soup = BeautifulSoup(Link.content, "lxml")
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
Output:
http://amzn.to/1KkULF3
book/index.htm
http://amzn.to/1KkULF3
http://amzn.to/1hLcoBy
http://amzn.to/1KkV42z
http://amzn.to/1fNOnbd
http://amzn.to/1N74xLt
http://do1.dr-chuck.net/py4inf/EN-us/book.pdf
http://do1.dr-chuck.net/py4inf/ES-es/book.pdf
https://twitter.com/fertardio
translations/KO/book_009_ko.pdf
http://www.xwmooc.net/python/
http://fanwscu.gitbooks.io/py4inf-zh-cn/
book_270.epub
translations/ES/book_272_es4.epub
https://www.gitbook.com/download/epub/book/fanwscu/py4inf-zh-cn
html-270/
html_270.zip
http://itunes.apple.com/us/book/python-for-informatics/id554638579?mt=13
http://www-personal.umich.edu/~csev/books/py4inf/ibooks//python_for_informatics.ibooks
http://www.py4inf.com/code
http://www.greenteapress.com/thinkpython/thinkCSpy/
http://allendowney.com/
Updated code for urllib:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'https://www.py4e.com/book.htm'
from urllib.request import Request, urlopen
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))

Python3 requests.get ignoring part of my URL (BEAUTIFULSOUP + PYTHON WEBSCRAPING)

I'm using requests.get like so:
import urllib3
import requests
from bs4 import BeautifulSoup
urllib3.disable_warnings()
cookies = {
    'PYPF': '3OyMLS2-xJlxKilWEOSvMQXAhyCgIhvAxYfbB8S_5lGBxxAS18Z7I8Q',
    '_ga': 'GA1.2.227320333.1496647453',
    '_gat': '1',
    '_gid': 'GA1.2.75815641.1496647453'
}
params = {
    'platform': 'xbox'
}
page = requests.get("http://www.rl-trades.com/#pf=xbox", headers={'Platform': 'Xbox'}, verify=False, cookies=cookies, params=params).text
page
soup = BeautifulSoup(page, 'html.parser')
... etc.
But, from my results in testing, it seems requests.get is ignoring '/#pf=xbox' in 'http://www.rl-trades.com/#pf=xbox'.
Is this because I am having to set verify to False? What is going on here?
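Most likely nothing is being ignored by accident: everything after # is a URL fragment, and fragments are never sent to the server (by requests or by a browser), so the server returns the same page and the xbox filtering happens client-side in JavaScript. A quick way to confirm (a small sketch using only standard requests/urllib APIs):
from urllib.parse import urlsplit
import requests

url = "http://www.rl-trades.com/#pf=xbox"

# The fragment lives after '#' and stays on the client.
print(urlsplit(url).fragment)                 # pf=xbox

# What actually goes on the wire is just the path (and query, if any).
prepared = requests.Request("GET", url).prepare()
print(prepared.path_url)                      # /  -- no trace of #pf=xbox
To get the Xbox-specific data you would have to hit whatever endpoint the page's JavaScript calls (visible in the browser's network tab) or drive a real browser, e.g. with Selenium; verify=False only affects certificate checking and is unrelated.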
