I am scraping this website : https://www.epicery.com/c/promos?gclid=CjwKCAjw97P5BRBQEiwAGflV6bGzNEAz7MTIrgelBkTR277v3lhStP5tH0wgxuLj1ytlcQAAjb-cxBoCsVwQAvD_BwE
I am trying to retrieve some info, such as the description, from a script tag.
I get the script content with an XPath, run a regex over it, and try to load the result as JSON:
script_path = response.xpath('/html/body/script[1]').get()
j_list = re.findall(r'\[(.*)\}\]', script_path)
j = j_list[0].replace("'", "")
json_script = json.loads(j)
But I get the following error, which I cannot get past:
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 152446 (char 152445)
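For context, json.loads raises "Extra data" when a complete JSON value is followed by more characters, which is what happens here when the regex capture grabs more than one array/object out of the script. A tiny self-contained illustration (not the site's actual payload):

import json

# The parser finishes the first object and then complains about the leftovers.
try:
    json.loads('{"a": 1}{"b": 2}')
except json.JSONDecodeError as e:
    print(e)  # Extra data: line 1 column 9 (char 8)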
I'm not sure exactly what you want, but this works for me:
def parse(self, response):
    taxons_str = response.xpath('//script[contains(., "var taxons")]/text()').re_first(r'(?s)var taxons = (.+?)var shops')
    if taxons_str:
        taxons = json.loads(taxons_str)
        for product in taxons:
            process_your_product(product)
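A note on why this works: the XPath picks the script whose text contains var taxons, and re_first with the (?s) flag lets the non-greedy (.+?) group span newlines, so it captures exactly the array literal assigned between var taxons = and var shops, which json.loads can then parse directly.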
I am in the process of writing a Python script that downloads a copy of quarantined emails from our gateway. The emails are in .eml format (text), so I was thinking this would be easy, but the resulting file does not open properly in Outlook due to newlines that get added.
Here is the download/write function:
def download_message(api_key, endpoint, id):
    endpoint = endpoint + "email/" + id
    headers = {
        'x-fireeye-api-key': api_key,
        'content-type': 'application/json',
        'Accept': 'application/json'}
    data = {}
    r = requests.get(endpoint, headers=headers, data=json.dumps(data))
    if "not found in quarantine." in r.text:
        print("Not found in Quarantine")
    else:
        filename = base_directory + id + ".eml"
        f = open(filename, "w")
        f.write(r.text)
        f.close()
        print("Written : " + id + ".eml" + " to disk")
Here is an example of the output file when opened in a text editor:
When opened in Outlook, this is what it looks like:
If I manually remove all those blank lines (regex: ^\n) and save the file, it works as expected.
I have tried quite a few ways of removing those blank lines, including strip, rstrip, and re.sub, and nothing has worked.
If it helps, what I was trying to do was create a new variable to hold the "modified" text and then pass that to the write function.
It would have looked something like this (sorry, I have tried loads of variations, but I think you will get the point):
filedata = r.text.strip("\n") or filedata = re.sub('^\n', "", r.text)
...
f.write(filedata)
Can anyone help?
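For what it's worth, a common cause of doubled blank lines is that .eml content already uses \r\n line endings, and writing it back through a text-mode file handle on Windows translates each \n into \r\n again, producing \r\r\n. A minimal sketch of that idea (reusing the poster's endpoint/headers/base_directory/id variables, not their exact code) is to write the raw bytes in binary mode so no newline translation happens:

r = requests.get(endpoint, headers=headers)
if "not found in quarantine." in r.text:
    print("Not found in Quarantine")
else:
    filename = base_directory + id + ".eml"
    # "wb" + r.content: bytes are written untouched, so the \r\n endings in the
    # .eml are not re-expanded into \r\r\n by text-mode newline translation.
    with open(filename, "wb") as f:
        f.write(r.content)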
I need help with pytest-html report customization. I need to print the failed network requests' status codes (per test case) in the report, so I wrote the code below. The Statuscode column is created successfully, but it gets no data in the HTML report; the per-test-case status code cell also does not appear in the report.
Conftest.py
@pytest.mark.optionalhook
def pytest_html_results_table_header(cells):
    cells.append(html.th('Statuscode'))

@pytest.mark.optionalhook
def pytest_html_result_table_row(report, cells):
    cells.append(html.td(report.statuscode))
def pytest_runtest_makereport(item):
    """
    Extends the PyTest Plugin to take and embed screenshot in html report, whenever test fails.
    :param item:
    """
    pytest_html = item.config.pluginmanager.getplugin('html')
    outcome = yield
    report = outcome.get_result()
    setattr(report, "duration_formatter", "%H:%M:%S.%f")
    extra = getattr(report, 'extra', [])
    statuscode = []
    if report.when == 'call' or report.when == "setup":
        xfail = hasattr(report, 'wasxfail')
        if (report.skipped and xfail) or (report.failed and not xfail):
            file_name = report.nodeid.replace("::", "_") + ".png"
            _capture_screenshot(file_name)
            if file_name:
                html = '<div><img src="%s" alt="screenshot" style="width:304px;height:228px;" ' \
                       'onclick="window.open(this.src)" align="right"/></div>' % file_name
                extra.append(pytest_html.extras.html(html))
            for request in driver.requests:
                if url in request.url and request.response.status_code >= 400 and request.response.status_code <= 512:
                    statuscode.append(request.response.status_code)
            print("*********Status codes************", statuscode)
            report.statuscode = statuscode
        report.extra = extra
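Not part of the original post, but two things that commonly produce exactly these symptoms, offered as a hedged sketch: the row hook has to be named pytest_html_results_table_row (with an "s" in "results", matching the header hook that already works), and a makereport hook that uses yield needs the hookwrapper decorator, otherwise the code after yield, which sets report.statuscode, never runs:

import pytest
from py.xml import html  # assumes the same html helper the question's conftest already uses

@pytest.mark.optionalhook
def pytest_html_results_table_row(report, cells):  # "results", not "result"
    # Fall back to an empty cell for reports that never got a statuscode attribute.
    cells.append(html.td(getattr(report, "statuscode", "")))

@pytest.hookimpl(hookwrapper=True)  # required so the code after `yield` executes
def pytest_runtest_makereport(item):
    outcome = yield
    report = outcome.get_result()
    # ... keep the existing screenshot / status-code collection logic here,
    # ending with: report.statuscode = statuscode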
I am new to scraping and I am trying to scrape data from this website: https://seffaflik.epias.com.tr/transparency/uretim/gerceklesen-uretim/gercek-zamanli-uretim.xhtml
When I try to get data without applying filters, everything works. But the data I need is for a specific power plant and date, and I am having a hard time figuring out why I cannot apply the filters.
import scrapy
from scrapy.http import FormRequest
from ..items import EpiasscrapingItem

class EpiasSpider(scrapy.Spider):
    name = 'epias'
    start_urls = [
        'https://seffaflik.epias.com.tr/transparency/uretim/gerceklesen-uretim/gercek-zamanli-uretim.xhtml'
    ]

    def parse(self, response):
        return FormRequest.from_response(response, formdata={
            'j_idt205': 'j_idt205',
            'j_idt205:date1_input': '20.03.2021',
            'j_idt205:date2_input': '20.03.2021',
            'j_idt205:powerPlant_input': '2614',
        }, callback=self.start_scraping)

    def start_scraping(self, response):
        items = EpiasscrapingItem()
        table_epias = response.css('.ui-datatable-odd')
        for epias in table_epias:
            date = epias.css('.ui-widget-content .TexAlCenter:nth-child(1)').css('::text').extract()
            time = epias.css('.ui-widget-content .TexAlCenter:nth-child(2)').css('::text').extract()
            biogas = epias.css('.ui-widget-content .TexAlCenter:nth-child(15)').css('::text').extract()
            items['date'] = date
            items['time'] = time
            items['biogas'] = biogas
            yield items
You forgot to include javax.faces.ViewState and a few other fields among the parameters that are supposed to be sent with the POST request. You can change the values of date1_input, date2_input and powerPlant_input to fetch the relevant content. The following script should work:
import scrapy

class EpiasSpider(scrapy.Spider):
    name = 'epias'
    start_urls = [
        'https://seffaflik.epias.com.tr/transparency/uretim/gerceklesen-uretim/gercek-zamanli-uretim.xhtml'
    ]
    post_url = 'https://seffaflik.epias.com.tr/transparency/uretim/gerceklesen-uretim/gercek-zamanli-uretim.xhtml'

    def parse(self, response):
        payload = {
            'j_idt205': 'j_idt205',
            'j_idt205:date1_input': '11.02.2021',
            'j_idt205:date2_input': '20.03.2021',
            'j_idt205:powerPlant_focus': '',
            'j_idt205:powerPlant_input': '2336',
            'j_idt205:goster': '',
            'j_idt205:dt_rppDD': '24',
            'javax.faces.ViewState': response.css(".ContainerIndent input[name='javax.faces.ViewState']::attr(value)").get()
        }
        yield scrapy.FormRequest(self.post_url, formdata=payload, callback=self.parse_content)

    def parse_content(self, response):
        for epias in response.css('.ui-datatable-odd'):
            items = {}
            date = epias.css('tr.ui-widget-content > .TexAlCenter:nth-child(1)::text').get()
            time = epias.css('tr.ui-widget-content > .TexAlCenter:nth-child(2)::text').get()
            total = epias.css('tr.ui-widget-content > .TexAlCenter:nth-child(3)::text').get()
            items['date'] = date
            items['time'] = time
            items['total'] = total
            yield items
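For context on why this works: JSF pages like this one embed a javax.faces.ViewState token in every rendered form and reject POSTs that do not echo it back, so the payload reads the token out of the initial GET response (the CSS selector on the ViewState input above) rather than hard-coding it.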
I'm currently working with Errbot, but I'm having trouble allowing users to enter a message to be passed along with the curl command. My plugin looks as follows:
@arg_botcmd('team_key', type=str)
@arg_botcmd('--message', dest='message', type=str)
def oncall_page(self, msg, team_key=None, message=None):
    if team_key in page_list.keys():
        team_id = page_list[team_key]
        data = {"message_type": "CRITICAL", "state_message": "{0}".format(message)}
        response = requests.post('https://www.apiurl.com/{0}'.format(team_id), data)
        yield "Paging {0} ".format(team_id)
My issue is with this line:
data = {"message_type":"CRITICAL","state_message":"{0}".format(message)}
This seems to be crashing the command completely. I'm hoping users can execute a single command such as "!oncall page team_name --message "
Any help would be appreciated :)
@arg_botcmd('team_key', type=str)
@arg_botcmd('--message', dest='message', type=str)
def oncall_page(self, msg, team_key=None, message=None):
    if team_key in page_list.keys():
        team_id = page_list[team_key]
        text = str(message)
        msg_type = "critical"
        data = '{"message_type":"%s", "state_message":"%s"}' % (msg_type, text)
        # data = '{"message_type":"critical", "state_message":"%s"}'(text)
        URL = 'https://www.apiurl.com/{0}'.format(team_id)
        response = requests.post(URL, data)
This is the fix for this!
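A side note rather than part of the original answer: building the JSON body by hand with % formatting will break as soon as the message contains a double quote. Assuming the endpoint simply expects a JSON body, a sketch of a more robust variant (the page_team helper name is only for illustration) is to let requests serialize the dict:

import requests

def page_team(team_id, message):
    url = 'https://www.apiurl.com/{0}'.format(team_id)  # same placeholder URL as above
    payload = {"message_type": "CRITICAL", "state_message": message}
    # json= serializes the dict and sets the Content-Type header,
    # so quotes or newlines in the message cannot corrupt the body.
    return requests.post(url, json=payload)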
So my code (pasted below) almost does what I want. It covers 29 of the 30 pages, but leaves out the last one. I would preferably have it go beyond that as well, but the website has no button for it (the pages do actually work when you manually fill in page=31 in the link). When DEPTH_LIMIT is 29 everything is fine, but at 30 I get the following error in the command prompt:
File "C:\Users\Ewald\Scrapy\OB\OB\spiders\spider_OB.py", line 23, in parse
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[@class="volgende"]/@href').extract()[0]
IndexError: list index out of range
I've tried various approaches, but they all seem to fail me...
class OB_Crawler(CrawlSpider):
    name = 'OB5'
    allowed_domains = ["https://www.officielebekendmakingen.nl/"]
    start_urls = ["https://zoek.officielebekendmakingen.nl/zoeken/resultaat/?zkt=Uitgebreid&pst=Tractatenblad|Staatsblad|Staatscourant|BladGemeenschappelijkeRegeling|ParlementaireDocumenten&vrt=Cybersecurity&zkd=InDeGeheleText&dpr=Alle&sdt=DatumPublicatie&ap=&pnr=18&rpp=10&_page=1&sorttype=1&sortorder=4"]
    custom_settings = {
        'BOT_NAME': 'OB-crawler',
        'DEPTH_LIMIT': 30,
        'DOWNLOAD_DELAY': 0.1
    }

    def parse(self, response):
        s = Selector(response)
        next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[@class="volgende"]/@href').extract()[0]
        if len(next_link):
            yield self.make_requests_from_url(next_link)
        posts = response.selector.xpath('//div[@class = "lijst"]/ul/li')
        for post in posts:
            i = TextPostItem()
            i['title'] = ' '.join(post.xpath('a/@href').extract()).replace(';', '').replace(' ', '').replace('\r\n', '')
            i['link'] = ' '.join(post.xpath('a/text()').extract()).replace(';', '').replace(' ', '').replace('\r\n', '')
            i['info'] = ' '.join(post.xpath('a/em/text()').extract()).replace(';', '').replace(' ', '').replace('\r\n', '').replace(',', '-')
            yield i
The index out of range error is the result of an incorrect XPath (you end up asking for the first item of an empty list).
Change your "next_link = ..." line to:
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[contains(@class, "volgende")]/@href').extract()[0]
You need to use contains(), which runs a predicate search and filters for what you want.
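One more hedged note on the last-page behavior mentioned in the question: on the final results page there is no "volgende" (next) link at all, so indexing [0] on the empty list still raises IndexError even with the corrected XPath. A small defensive sketch (same selector, just guarded):

def parse(self, response):
    next_href = response.xpath('//a[contains(@class, "volgende")]/@href').get()
    if next_href:
        # Only follow when the page actually has a "next" button.
        yield response.follow(next_href, callback=self.parse)
    # ... item extraction stays the same as in the original spider ...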