How to add multithreading in the Spider Crawler in python? - multithreading

I have written a Spider Crawler using BeautifulSoup and urllib2. This parses all the links upto 2 levels and collects all the html pages in a list. I tried making it multithreaded to add a little pace to the Spidering process however couldn't figure out where to begin ?
Below is the code.
#
#!/usr/bin/python
from bs4 import BeautifulSoup
import time
import urllib2
import sys
masterList = []
masterList1 = []
htmlList = []
url = "http://www.securitytube.net"
dictList = []
def spidy(url):
try:
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
if soup:
for links in soup.findAll('a',href=True):
ele = links['href']
if ".html" in ele and "http://" in ele:
htmlList.append(ele)
print ele
elif ".html" in ele and "https://" in ele:
htmlList.append(ele)
print ele
else:
masterList.append(ele)
for ele in masterList:
if 'mailto:' in ele:
masterList.remove(ele)
except:
print "url %s is not accessible ... Moving on to the next URL .."%(url)
pass
def level():
masterList1 = list(set(masterList))
for url1 in masterList1:
print "Running Spidy on : %s"%(url1)
print "\n########################################################\n"
spidy(url1)
print "\n########################################################\n"
masterList1.remove(url1)
masterList.remove(url1)
def main():
spidy("http://www.securitytube.net")
level()
level()
print "\n\n\n\n\n********************************************************************"
print htmlList

Have you ever heard of Scrapy?
Its super easy to create a simple spider with this and people have been developing it for years. It is not exactly multithreaded but it uses Twisted on the background so it is completely asynchronous and event-based.

Related

Runtime Request URL change not working scrapy

I have written a script in Python using Scrapy. The code runs to fetch all the pages that exist containing the code. It works fine on the first page load when scrapy is started and as per the script logic gets us page no. 2. But after loading page 2 I am unable to get xpath of the new page loaded so I can move ahead this way and get all the web-page numbers.
Sharing the code snippet.
import scrapy
from scrapy import Spider
class PostsSpider(Spider):
name = "posts"
start_urls = [
'https://www.boston.com/category/news/'
]
def parse(self, response):
print("first time")
print(response)
results = response.xpath("//*[contains(#id, 'load-more')]/#data-next-page").extract_first()
print(results)
if results is not None:
for result in results:
page_number = 'page/' + result
new_url = self.start_urls[0] + page_number
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse)
else:
print("last page")
It is because the page doesn't create new get requests when it loads the next page, it makes an ajax call to an api that returns json.
I made some adjustments to your code so it should work properly now. I am assuming that there is something other than the next page number you are trying to extract from each page, so I wrapped the html string into a scrapy.Slector class so you can use Xpath and such on it. This script will crawl alot of pages really fast, so you might want to adjust your settings to slow it down too.
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
class PostsSpider(Spider):
name = "posts"
ajaxurl = "https://www.boston.com/wp-json/boston/v1/load-more?taxonomy=category&term_id=779&search_query=&author=&orderby=&page=%s&_wpnonce=f43ab1aae4&ad_count=4&redundant_ids=25129871,25130264,25129873,25129799,25128140,25126233,25122755,25121853,25124456,25129584,25128656,25123311,25128423,25128100,25127934,25127250,25126228,25126222"
start_urls = [
'https://www.boston.com/category/news/'
]
def parse(self, response):
new_url = None
try:
json_result = response.json()
html = json_result['data']['html']
selector = Selector(text=html, type="html")
# ... do some xpath stuff with selector.xpath.....
new_url = self.ajaxurl % json_result["data"]["nextPage"]
except:
results = response.xpath("//*[contains(#id, 'load-more')]/#data-next-page").extract_first()
if results is not None:
for result in results:
new_url = self.ajaxurl % result
if new_url:
print(new_url)
yield scrapy.Request(url=new_url, callback=self.parse)

Trying to scrape multiple pages sequentially using loop to change url

First off, sorry... I am sure that this is a common problem but i did not find the solution anywhere eventhough i searched for a while.
I am trying to create list by scraping data from classicdb. The two problems i have is.
The scraping as written in the try loop does not work in inside the for loop but on its own it works. Currently it just returns the 0 even though there should be values to return.
The output that i get from the try loop gernerates new lists but i want to just get the value and append it later.
I have tried the try function outside the for loop and there it worked.
I also saw some solutions where a while true was used but that did not work for me.
from lxml.html import fromstring
import requests
import traceback
import time
from bs4 import BeautifulSoup as bs
Item_name=[]
Sell_Copper=[]
items= [47, 48]
url = 'https://classic.wowhead.com/item='
fails=[]
for i in items:
time.sleep(5)
url1=(url+str(i))
session = requests.session()
response = session.get(url1)
soup = bs(response.content, 'lxml')
name=soup.select_one('h1').text
print(name)
#get the buy prices
try:
copper = soup.select_one('li:contains("Sells for") .moneycopper').text
except Exception as e:
copper=str(0)
The expected result would be that i get one value in gold and a list in P_Gold. In this case:
copper='1'
Sell_copper=['1','1']
You don't need a sleep. It needs to be div:contains and the search text needs changing
import requests
from bs4 import BeautifulSoup as bs
Item_name=[]
Sell_Copper=[]
items= [47, 48]
url = 'https://classic.wowhead.com/item='
fails=[]
with requests.Session() as s:
for i in items:
response = s.get(url + str(i))
soup = bs(response.content, 'lxml')
name = soup.select_one('h1').text
print(name)
try:
copper = soup.select_one('div:contains("Sell Price") .moneycopper').text
except Exception as e:
copper=str(0)
print(copper)

Writing multiple files as output when webscraping - python bs4

to preface - I am quite new to python and my HTML skills are kindergarten level.
So I am trying to save the quotes from this website which has many links in it for each member of the US Election candidates.
I have managed to get the actual code to extract the quotes (with the help of soem stackoverflow users), but am lost on how to write these quotes in to separate text files for each candidate.
For example, the first page, with all of Justin Amash's quotes should be written to a file: JustinAmash.txt.
The second page, with all of Michael Bennet's quotes should be written to MichaelBennet.txt (or something in that form). and so on.. Is there a way to do this?
For reference, to scrape the pages, the following code works:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
def make_soup(url):
# set up known browser user agent for the request to bypass HTMLError
req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
#opening up connection, grabbing the page
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
#html is jumbled at the moment, so call html using soup function
soup = soup_(page_html, "lxml")
return soup
# Test: print title of page
#soup.title
soup = make_soup(my_url)
tags = soup.findAll("a" , href=re.compile("javascript:pop\("))
#print(tags)
# open a text file and write it if it doesn't exist
file1 = open("Quotefile.txt","w")
# get list of all URLS
for links in tags:
link = links.get('href')
if "java" in link:
print("http://archive.ontheissues.org" + link[18:len(link)-3])
main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
try:
sub_soup = make_soup(main_url)
content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
#text_data = [] #This list can be used to store data related to every person
for item in content_collexn:
#Accept an item if it belongs to the following classes
if(type(item) == str):
print(item.get_text())
elif(item.name == "h3"):
#Note that over here, every h3 tagged title has a string following it
print(item.get_text())
#Hence, grab that string too
print(item.next_sibling)
elif(item.name in ["p", "ul", "ol"]):
print(item.get_text())
except HTTPError: #Takes care of missing pages and related HTTP exception
print("[INFO] Resource not found. Skipping to next link.")
#print(text_data)
You can store that text data into the list you had started with text_data. Join all those items and then write to file:
So something like:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
def make_soup(url):
# set up known browser user agent for the request to bypass HTMLError
req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
#opening up connection, grabbing the page
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
#html is jumbled at the moment, so call html using soup function
soup = soup_(page_html, "lxml")
return soup
# Test: print title of page
#soup.title
soup = make_soup(my_url)
tags = soup.findAll("a" , href=re.compile("javascript:pop\("))
#print(tags)
# open a text file and write it if it doesn't exist
#file1 = open("Quotefile.txt","w")
# get list of all URLS
candidates = []
for links in tags:
link = links.get('href')
if "java" in link:
#print("http://archive.ontheissues.org" + link[18:len(link)-3])
main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
candidate = link.split('/')[-1].split('_Free_Trade')[0]
if candidate in candidates:
continue
else:
candidates.append(candidate)
try:
sub_soup = make_soup(main_url)
content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
text_data = [] #This list can be used to store data related to every person
for item in content_collexn:
#Accept an item if it belongs to the following classes
if(type(item) == str):
#print(item.get_text())
text_data.append(item.get_text())
elif(item.name == "h3"):
#Note that over here, every h3 tagged title has a string following it
#print(item.get_text())
text_data.append(item.get_text())
#Hence, grab that string too
#print(item.next_sibling)
text_data.append(item.next_sibling)
elif(item.name in ["p", "ul", "ol"]):
#print(item.get_text())
text_data.append(item.get_text())
except HTTPError: #Takes care of missing pages and related HTTP exception
print("[INFO] Resource not found. Skipping to next link.")
candidates.remove(candidate)
continue
text_data = '\n'.join(text_data)
with open("C:/%s.txt" %(candidate), "w") as text_file:
text_file.write(text_data)
print('Aquired: %s' %(candidate))

How to scrape all the test match details in cricinfo

I am trying to scrape all the test match details but it is showing HTTP Error 504: Gateway Timeout I am getting the details of test matches but it is not showing this is my code i have used bs4 to scrape the test match details from cricinfo
I need to scrape the details of 2000 test matches this is my code
import urllib.request as req
BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('./espncricinfo-fc'):
os.mkdir('./espncricinfo-fc')
for i in range(0, 2000):
soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
time.sleep(1)
for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
try:
new_host = new_host['href']
except:
continue
odiurl =BASE_URL + urljoin(BASE_URL,new_host)
new_host = unicodedata.normalize('NFKD', new_host).encode('ascii','ignore')
print(new_host)
html = req.urlopen(odiurl).read()
if html:
with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
f.write(html)
print(html)
else:
print("no html")
this is usually happen when doing multiple request too fast, it can be the server is down or your connection blocked by server firewall, try increase your sleep() or add random sleep.
import random
.....
for i in range(0, 2000):
soupy = BeautifulSoup(....)
time.sleep(random.randint(2,6))
not sure why, seems to be working for me.
I made a few changes in the loop through the links. I'm not sure how you're wanting the output to look in terms of writing it to your file, so I left that part alone. But like I said, seems to be working ok on my end.
import bs4
import requests
import os
import time
import urllib.request as req
BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('C:/espncricinfo-fc'):
os.mkdir('C:/espncricinfo-fc')
for i in range(0, 2000):
i=0
url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=%s' %i
html = requests.get(url)
print ('Checking page %s of 2000' %(i+1))
soupy = bs4.BeautifulSoup(html.text, 'html.parser')
time.sleep(1)
for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
try:
new_host = new_host['href']
except:
continue
odiurl = BASE_URL + new_host
new_host = odiurl
print(new_host)
html = req.urlopen(odiurl).read()
if html:
with open('C:/espncricinfo-fc/{0!s}'.format('_'.join(str.split(new_host, "/")[4:])), "wb") as f:
f.write(html)
#print(html)
else:
print("no html")

Can't print tag 'content' anymore

I had a perfectly well working scraper for TripAdvisor, it met all my needs, then I tried to use it after a four day break and something went wrong, I quickly realized that TA had changed some of the tags, I made the appropriate changes and I still couldn't get it working as before. I want to grab the value of the 'content' tag within an element.
This is the element:
<div class="prw_rup prw_common_bubble_rating bubble_rating" data-prwidget-init="" data-prwidget-name="common_bubble_rating"><span alt="5 of 5 bubbles" class="ui_bubble_rating bubble_50" content="5" property="ratingValue" style="font-size:18px;"></span></div>
and here is the code:
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([img["content"] for img in bubs.select("img[content]")])
but now it only gives me an empty '[]' instead of the content which is '5'.
Anybody know what may have changed?
here is the rest of my code
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
theurl = "https://www.tripadvisor.com/Hotels-g147364-c3-Cayman_Islands-Hotels.html"
thepage = urllib
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
base_url = "https://www.tripadvisor.com"
urls = []
init_info = []
init_data = open('/Users/paribaker/Desktop/scrapping/TripAdvisor/Inv/speccaydata.txt', 'w')
for link in soup.findAll('a',href=re.compile('/Hotel_Review')):
urls.append(base_url + (link.get('href')).strip("#REVIEWS"))
def remove_duplicates(urls):
output= []
seen = set()
for line in urls:
if line not in seen:
output.append(line)
seen.add(line)
return output
urls2 = remove_duplicates(urls)
for url in urls2:
try:
driver = webdriver.Chrome()
driver.get(url)
element = driver.find_element_by_id("taplc_prodp13n_hr_sur_review_filter_controls_0_filterLang_ALL").click()
print("succesfull")
moreinfo = driver.page_source
moresoup = BeautifulSoup(moreinfo,"html.parser")
driver.close()
#moreinfo = urllib
#moreinfo = urllib.request.urlopen(url)
#moresoup = BeautifulSoup(moreinfo,"html.parser")
except:
print("none")
for data in moresoup.findAll('div', {"class":"heading_2014 hr_heading"}):
try:
for title in data.findAll('h1',{'id':"HEADING"}):
init_info.append(title.text.strip("\n")+ ",\t")
for add_data in data.findAll('span',{'class':'format_address'}):
print((add_data.find('span',{'class':'street-address'}).text +",\t"))
init_info.append(add_data.find('span',{'class':'street-address'}).text +",\t")
init_info.append(add_data.find('span',{'class':'locality'}).text + ",\t")
init_info.append(add_data.find('span',{'class':'country-name'}).text + ",\t")
for reviews in data.findAll('a',{'class':'more taLnk'}):
init_info.append(reviews.text).strip("\n")
init_info.append(", \t")
#init_info.append([img["alt"] for img in stars.select("img[alt]")])
#init_info.append([img["content"] for img in stars.select("img[content]")])
except :
init_info.append("N/A" + ", /t")
The element with the content="5" attribute is a span, not an img.
Does this get what you want?
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([elem["content"] for elem in bubs.select("span[content]")])

Resources