How to scrape headings on an About page? - python-3.x

I'm trying to scrape the headings on the About page, but I've tried a lot and failed because I don't properly understand what to do. I'm a beginner, so I need help.
import scrapy

from ..items import DmoztutorialItem


class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = [
        'http://dmoz-odp.org/',
    ]

    def parse(self, response):
        items = DmoztutorialItem()
        Navbar = response.css('#main-nav a::text').extract()
        Category_names = response.css('.top-cat a::text').extract()
        Subcategories = response.css('.sub-cat a::text').extract()
        items['Navbar'] = Navbar
        items['Category_names'] = Category_names
        items['Subcategories'] = Subcategories
        yield items

        # Nav_page = response.css('#main-nav a::attr(href)').extract()
        Nav_page = 'http://dmoz-odp.org/docs/en/about.html'.extract()
        # About_heading = response.css('h1+ p , #mainContent
        # h1::text').extract()
        items['Nav_page'] = Nav_page
        # items['About_heading'] = About_heading
        yield response.follow(Nav_page)

Can you tell what kind of output you need? It is very unclear from your post.
Check this example, where you can:
Get some data;
Call a request to another page with the data saved;
Yield the final data.
Hope it helps.
import scrapy


class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'
    start_urls = ['http://dmoz-odp.org/']
    nav_page = 'http://dmoz-odp.org/docs/en/about.html'

    def parse(self, response):
        # collect data on the first page
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
            'Nav_page': self.nav_page,
        }
        # save the data and call a request to another page
        yield response.follow(self.nav_page, self.parse_nav, meta={'items': items})

    def parse_nav(self, response):
        # do your stuff on the second page
        items = response.meta['items']
        items['something'] = 'something'  # add your logic
        yield items
Or write separate parsing logic for separate pages:
import scrapy


class DmozSpiderSpider(scrapy.Spider):
    name = 'Dmoz'

    def start_requests(self):
        reqs = (
            ('http://dmoz-odp.org/', self.parse_main),
            ('http://dmoz-odp.org/docs/en/about.html', self.parse_nav),
        )
        for link, callback in reqs:
            yield scrapy.Request(link, callback)

    def parse_main(self, response):
        items = {
            'Navbar': response.css('#main-nav a::text').extract(),
            'Category_names': response.css('.top-cat a::text').extract(),
            'Subcategories': response.css('.sub-cat a::text').extract(),
        }
        yield items

    def parse_nav(self, response):
        items = {
            'something': 'something',  # add your logic
        }
        yield items

To parse a different HTML page, you need to yield a Request object with the target URL as the first argument for its constructor, and do the parsing in the method of your spider that you pass to the constructor of that Request object as the callback parameter.
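For illustration, a minimal sketch of that pattern, reusing the about page URL and the commented-out selector from the question (the item key is just an example):

import scrapy


class AboutSpider(scrapy.Spider):
    name = 'about'
    start_urls = ['http://dmoz-odp.org/']

    def parse(self, response):
        # Yield a Request for the other page and pass the method that
        # should parse it as the callback.
        yield scrapy.Request(
            'http://dmoz-odp.org/docs/en/about.html',
            callback=self.parse_about,
        )

    def parse_about(self, response):
        # Parsing of the second page happens in the callback.
        yield {'About_heading': response.css('#mainContent h1::text').extract()}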
I strongly encourage you to complete the Scrapy tutorial. What you are trying to achieve is covered in the Following links section.

Related

How to get all the data from a certain website with Scrapy?

I'm currently learning scraping with Scrapy, and I want to scrape data from this: https://www.espn.com/nba/stats/player or https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2
If you go to the link you will see a Show More button at the bottom of the data. This confuses me, because if I scrape the page as it is now I only get 50 rows of data, which is not what I want, so I looked into the Show More button, but it is only an href="#".
[UPDATE] Using Scrapy + Playwright
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageMethod('wait_for_selector', '//a[@class="AnchorLink loadMore__link"]'),
                PageMethod('click', '//a[@class="AnchorLink loadMore__link"]'),
            ]
        ),
        callback=self.parse,
    )

async def parse(self, response):
    page = response.meta['playwright_page']
    button = response.meta['playwright_page_coroutines'][0]
    if button:
        await button.click()
    resp = response.body
    player_list = sel.xpath(
        "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath(
        "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    await page.wait_for_selector(player_list)
    sel = Selector(text=resp)
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
When only using Scrapy:
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        callback=self.parse,
    )

def parse(self, response):
    sel = Selector(text=response.body)
    player_list = sel.xpath("//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath("//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
Am I doing it wrong here? Also, if you click Show More it calls the API shown below; I can scrape from that API, but for now I want to get the data from the HTML with XPath itself.
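For reference, a minimal sketch of one way this could be structured with scrapy-playwright. This is an assumption-laden sketch, not a verified solution: it assumes a recent scrapy-playwright where the meta key is playwright_page_methods, the extra wait after the click is a guess at how long the new rows take to render, and the shortened contains(@class, ...) XPaths are simplifications of the class names in the question.

import scrapy
from scrapy_playwright.page import PageMethod


class EspnStatsSpider(scrapy.Spider):
    name = 'espn_stats'

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
            meta={
                'playwright': True,
                'playwright_page_methods': [
                    # wait for the Show More link, click it, then give the new rows time to render
                    PageMethod('wait_for_selector', 'a.AnchorLink.loadMore__link'),
                    PageMethod('click', 'a.AnchorLink.loadMore__link'),
                    PageMethod('wait_for_timeout', 2000),
                ],
            },
            callback=self.parse,
        )

    def parse(self, response):
        # The response body already reflects the DOM after the page methods ran,
        # so the rows can be selected straight from the response.
        players = response.xpath("//table[contains(@class, 'Table--fixed-left')]//tbody/tr")
        stats = response.xpath("//div[contains(@class, 'Table__ScrollerWrapper')]//table/tbody/tr")
        for player, stat in zip(players, stats):
            yield {
                'player_name': player.xpath('.//a/text()').get(),
                'team_name': player.xpath('.//span/text()').get(),
                'points_per_game': stat.xpath('.//td[4]/text()').get(),
            }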

How to scrape a website with multiple pages under the same URL address using scrapy-playwright

I am trying to scrape a website that has multiple pages under the same URL using scrapy-playwright.
The following script returned only the data of the second page and did not continue to the rest of the pages.
Can anyone suggest how I can fix it?
import scrapy
from scrapy_playwright.page import PageMethod
from scrapy.crawler import CrawlerProcess


class AwesomeSpideree(scrapy.Spider):
    name = "awesome"

    def start_requests(self):
        # GET request
        yield scrapy.Request(
            url=f"https://www.cia.gov/the-world-factbook/countries/",
            callback=self.parse,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods={
                    "click": PageMethod('click', selector='xpath=//div[@class="pagination-controls col-lg-6"]//span[@class="pagination__arrow-right"]'),
                    "screenshot": PageMethod("screenshot", path=f"step1.png", full_page=True)
                },
            )
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        await page.close()
        print("-" * 80)
        CountryLst = response.xpath("//div[@class='col-lg-9']")
        for Country in CountryLst:
            yield {
                "country_link": Country.xpath(".//a/@href").get()
            }
I see you are trying to fetch the URLs of countries from the above-mentioned URL.
If you inspect the Network tab, you can see there is a request to a JSON data API; you can fetch all of the country URLs from that API.
After that, if you still want to scrape more data from the scraped URLs, you can easily do so without Playwright, because that data is static.
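For illustration only, a rough sketch of that JSON-first approach with plain Scrapy; the endpoint URL and the key names below are placeholders, since the real ones have to be taken from the request seen in the Network tab:

import json

import scrapy


class CountryLinksSpider(scrapy.Spider):
    name = "country_links"
    # Placeholder endpoint: replace with the JSON API request seen in the Network tab.
    start_urls = ["https://example.com/world-factbook/countries.json"]

    def parse(self, response):
        data = json.loads(response.text)
        # Hypothetical payload structure: adapt the keys to the real JSON.
        for country in data.get("countries", []):
            yield {"country_link": country.get("url")}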
Have a good day :)

Attempting login with Scrapy-Splash

Since I am not able to log in to https://www.duif.nl/login, I tried many different methods, like Selenium, with which I logged in successfully but didn't manage to start crawling.
Now I've tried my luck with scrapy-splash, but I can't log in :(
If I render the login page with Splash, I see the following picture:
Well, there should be a login form with username and password fields, but Scrapy can't see it?
I've been sitting in front of that login form for about a week and I'm losing my will to live.
My last question didn't even get one answer, so now I'm trying again.
Here is the HTML code of the login form:
When I log in manually, I get redirected to "/login?returnUrl=", where I only have this form data:
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.spiders import CrawlSpider, Rule
from ..items import ScrapysplashItem
from scrapy.http import FormRequest, Request
import csv


class DuifSplash(CrawlSpider):
    name = "duifsplash"
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    with open('duifonlylinks.csv', 'r') as f:
        reader = csv.DictReader(f)
        start_urls = [items['Link'] for items in reader]

    def start_requests(self):
        yield SplashRequest(
            url=self.login_page,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        return FormRequest.from_response(
            response,
            formdata={
                'username': 'not real',
                'password': 'login data',
            }, callback=self.after_login)

    def after_login(self, response):
        accview = response.xpath('//div[@class="c-accountbox clearfix js-match-height"]/h3')
        if accview:
            print('success')
        else:
            print(':(')

        for url in self.start_urls:
            yield response.follow(url=url, callback=self.parse_page)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No productlink', response.url)
        for a in productpage:
            items = ScrapysplashItem()
            items['SKU'] = response.xpath('//p[@class="desc"]/text()').get()
            items['Title'] = response.xpath('//h1[@class="product-title"]/text()').get()
            items['Link'] = response.url
            items['Images'] = response.xpath('//div[@class="inner"]/img/@src').getall()
            items['Stock'] = response.xpath('//div[@class="desc"]/ul/li/em/text()').getall()
            items['Desc'] = response.xpath('//div[@class="item"]/p/text()').getall()
            items['Title_small'] = response.xpath('//div[@class="left"]/p/text()').get()
            items['Price'] = response.xpath('//div[@class="price"]/span/text()').get()
            yield items
In my "prework", i crawled every internal link and saved it to a .csv-File, where i analyse which of the links are product links and which are not.
Now i wonder, if i open a link of my csv, it opens an authenticated session or not?
I cant find no cookies, this is also strange to me
UPDATE
I managed to login successfully :-) now i only need to know where the cookies are stored
Lua Script
LUA_SCRIPT = """
function main(splash, args)
    splash:init_cookies(splash.args.cookies)
    splash:go("https://www.duif.nl/login")
    splash:wait(0.5)
    local title = splash:evaljs("document.title")
    return {
        title = title,
        cookies = splash:get_cookies(),
    }
end
"""
I don't think using Splash here is the way to go, as even with a normal Request the form is there: response.xpath('//form[@id="login-form"]')
There are multiple forms available on the page, so you have to specify which form you want to base yourself on to make a FormRequest.from_response. It's best to specify the clickdata as well (so it goes to 'Login', not to 'forgot password'). In summary, it would look something like this:
req = FormRequest.from_response(
    response,
    formid='login-form',
    formdata={
        'username': 'not real',
        'password': 'login data'},
    clickdata={'type': 'submit'},
)
If you don't use Splash, you don't have to worry about passing cookies - this is taken care of by Scrapy. Just make sure you don't put COOKIES_ENABLED=False in your settings.py.
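To show where that request would sit in the spider, here is a minimal sketch; the spider name, the callback, the credentials, and the logged-in check are placeholders drawn from the question, not a verified solution:

import scrapy
from scrapy import FormRequest


class DuifLoginSpider(scrapy.Spider):
    name = "duif_login"
    start_urls = ["https://www.duif.nl/login"]

    def parse(self, response):
        # Target the login form explicitly and submit via its submit button.
        yield FormRequest.from_response(
            response,
            formid='login-form',
            formdata={'username': 'not real', 'password': 'login data'},
            clickdata={'type': 'submit'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Placeholder check: adapt the XPath to something that only appears when logged in.
        if response.xpath('//div[contains(@class, "c-accountbox")]'):
            self.logger.info('login succeeded')
        else:
            self.logger.warning('login may have failed')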

How to crawl dynamically generated data in Google's Web Store search results

I want to crawl a web page which shows the results of a search in Google's Web Store, where the link is static for that particular keyword.
I want to find the ranking of an extension periodically.
Here is the URL.
The problem is that I can't render the dynamic data generated by JavaScript code in the response from the server.
I tried using Scrapy and Scrapy-Splash to render the desired page, but I was still getting the same response. I used Docker to run an instance of the scrapinghub/splash container on port 8050. I even visited http://localhost:8050 and entered my URL manually, but it couldn't render the data, although the message showed success.
Here's the code I wrote for the crawler. It actually does nothing; its only job is to fetch the HTML contents of the desired page.
import scrapy
from scrapy_splash import SplashRequest


class WebstoreSpider(scrapy.Spider):
    name = 'webstore'

    def start_requests(self):
        yield SplashRequest(
            url='https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions',
            callback=self.parse,
            args={
                "wait": 3,
            },
        )

    def parse(self, response):
        print(response.text)
and the contents of the settings.py of my Scrapy project:
BOT_NAME = 'webstore_cralwer'

SPIDER_MODULES = ['webstore_cralwer.spiders']
NEWSPIDER_MODULE = 'webstore_cralwer.spiders'

ROBOTSTXT_OBEY = False

SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
And as a result, I always get nothing.
Any help is appreciated.
Works for me with a small custom Lua script:
lua_source = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(5.0))
    return {
        html = splash:html(),
    }
end
"""
You can then change your start_requests as follows:
def start_requests(self):
    yield SplashRequest(
        url='https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions',
        callback=self.parse,
        args={'lua_source': self.lua_source},
    )
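One detail worth checking, as an assumption on my part rather than something stated in the answer above: for scrapy-splash to run a custom script, lua_source has to be reachable as self.lua_source on the spider, and the request normally has to go to the 'execute' endpoint rather than the default render endpoint. A sketch of how the pieces might fit together:

import scrapy
from scrapy_splash import SplashRequest


class WebstoreSpider(scrapy.Spider):
    name = 'webstore'

    # The Lua script from above, stored on the spider so self.lua_source works.
    lua_source = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(5.0))
        return {
            html = splash:html(),
        }
    end
    """

    def start_requests(self):
        yield SplashRequest(
            url='https://chrome.google.com/webstore/search/netflix%20vpn?utm_source=chrome-ntp-icon&_category=extensions',
            callback=self.parse,
            endpoint='execute',  # needed so Splash runs the custom script
            args={'lua_source': self.lua_source},
        )

    def parse(self, response):
        print(response.text)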

Remove all DOM elements with a specific tag?

I'm trying to remove all head:book elements from the following request excerpt:
<head:bookstore>
    <head:book>9</head:book>
    <head:book>10</head:book>
</head:bookstore>
requestHolder.getDomNodes("//head:bookstore/head:book").each {
    requestNode.removeChild(it)
}
What am I doing wrong here?
UPDATE:
def groovyUtils = new com.eviware.soapui.support.GroovyUtils(context)
def requestHolder = groovyUtils.getXmlHolder("openBook#Request")
Below is a solution to remove the nodes from the XML. Please note that you can get the entire XML, and you may need to edit the node names in the script below.
// Get the entire request
def request = context.expand( '${Test Request#Request#declare namespace soap=\'http://www.w3.org/2003/05/soap-envelope\'; //soap:Envelope[1]}' )
// or create the XML
def BOOKS = '''
<bookstore>
<book>9</book>
<book>10</book>
</bookstore>
'''
def booksParser = new XmlParser().parseText(BOOKS)
def allBooks = booksParser.children()
booksParser.remove(allBooks)
new XmlNodePrinter().print(booksParser)
log.info booksParser
