Cannot get text from found list though text exists - python-3.x

I am trying to scrape reference texts from this paper.
When I go to the site, the references section does not show up; to see it, I have to click either "References" or "+Show References". So I am trying to find the references link and click it.
Here is my code:
from selenium.webdriver.common.by import By

# browser is an already initialized Selenium WebDriver
browser.get('https://doi.org/10.3847/1538-4357/abb3c9')
refCheck = ["references", "cited literature", "literature cited", "refs"]
for h in range(0, len(browser.find_elements(By.XPATH, '//a[@href]'))):
    textSearch = browser.find_elements(By.XPATH, '//a[@href]')[h].text
    href = browser.find_elements(By.XPATH, '//a[@href]')[h].get_attribute("href")
    if (textSearch.lower() in refCheck) & (len(href) > 0):
        print(h)
        print(textSearch)
        print(href)
        break

browser.get(href)
refList = []
attrList = []
refCheck = ["references", "cited literature", "literature cited", "refs"]
tags = ["ol", "ul"]
for t in tags:
    if len(browser.find_elements(By.TAG_NAME, t)) > 0:
        for i in range(0, len(browser.find_elements(By.TAG_NAME, t))):
            for attr in browser.find_elements(By.TAG_NAME, t)[i].get_property('attributes'):
                for rc in refCheck:
                    if (rc in attr['name'].lower()) | (rc in attr['value'].lower()):
                        attrList.append(t)
                        attrList.append(i)
                        attrList.append(attr['name'])
                        attrList.append(attr['value'])
                        print(attr['name'])
                        print(attr['value'])
                        print(len(browser.find_elements(By.TAG_NAME, t)[i].find_elements(By.XPATH, './li')))
                if len(attrList) > 0:
                    break
            if len(attrList) > 0:
                break
        if len(attrList) > 0:
            break

cnt = 0
for f in browser.find_elements(By.TAG_NAME, t)[i].find_elements(By.XPATH, './li'):
    print(f.text)
    if len(f.text) > 0:
        refList.append(f.text)
        cnt += 1
print(cnt)
However, the returned text is always empty.
P.S. I also tried clicking the element I found instead of calling browser.get(href), but that does not work either; get_attribute("href") always returns a plain string, which cannot be clicked.
How should I get that text?
EDIT:
Found the answer here: link
Using get_attribute("textContent") solved my issue.
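For anyone hitting the same wall, here is a minimal sketch of the difference (assuming browser is an initialized Selenium WebDriver already on the references page, and that the references sit in an ol list, which is an assumption about the page): .text returns only rendered, visible text, while the textContent DOM property also includes text in collapsed or hidden elements.
from selenium.webdriver.common.by import By

for li in browser.find_elements(By.XPATH, '//ol/li'):
    print(li.text)                          # empty for collapsed/hidden elements
    print(li.get_attribute("textContent"))  # full DOM text, even when hidden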

Alternatively, you can use the crossref.org API: if you look a paper up by DOI, it returns a JSON response that contains a 'reference' field. You can then work with the JSON however you want.
import requests

def get_ref(doi):
    url = f'https://api.crossref.org/works/{doi}'
    response = requests.get(url)
    if response.status_code == 200:
        response = response.json()
        return response['message']['reference']
    return None

doi = '10.3847/1538-4357/abb3c9'
get_ref(doi)
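As a brief usage note (a sketch based on the Crossref schema as I understand it: reference entries typically carry some of 'key', 'DOI', and a free-text 'unstructured' citation, and which keys are present varies per record):
refs = get_ref('10.3847/1538-4357/abb3c9')
if refs:
    for r in refs:
        # Prefer the free-text citation, fall back to the DOI or the entry key.
        print(r.get('unstructured') or r.get('DOI') or r.get('key'))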

Related

list index out of range, but it seems impossible since it only happens after 3 questions

kanji = ['上','下','大','工','八','入','山','口','九','一','人','力','川','七','十','三','二','女']
reading = ['じょう','か','たい','こう','はち','にゅう','さん','こう','く','いち','にん','りょく','かわ','しち','じゅう','さん','に','じょ']
definition = ['above','below','big','construction','eight','enter','mountain','mouth','nine','one','person','power','river','seven','ten','three','two','woman']

score = number_of_questions = kanji_item = 0

def question_format(prompt_type,lang,solution_selection):
    global reading,definition,score,num_of_questions,kanji_item
    question_prompt = 'What is the '+str(prompt_type)+' for "'+str(kanji[kanji_item])+'"? (Keyboard:'+str(lang)+')\n'
    solution_selection = [reading,definition]
    usr = input(question_prompt)
    if usr in solution_selection[kanji_item] and kanji[kanji_item]:
        score += 1
        num_of_questions += 1
    else:
        pass
    kanji_item += 1

while number_of_questions != 18:
    question_format('READING','Japanese',[0])
    print('You got ',score,'/',number_of_questions)

while number_of_questions != 36:
    question_format('DEFINITION','English',[1])
    print('You got ',score,'/',number_of_questions)
I can't get past 大, but I can't see where it's messing up. I've tried changing pretty much everything. kanji_item is supposed to provide a common index number so that the answers can match up. It gets through the first two problems with no hassle, but for some reason it refuses to accept my third answer.
Problems:
- wrong name: number_of_questions vs. num_of_questions
- wrong way to check truthiness in if usr in solution_selection[kanji_item] and kanji[kanji_item]: - the last part is always True, as it is a non-empty string
- solution_selection = [reading, definition] has only two elements, so solution_selection[kanji_item] raises an IndexError as soon as kanji_item reaches 2 - which is exactly the third question
- lots of globals, which is not considered very good style
It would be easier to zip your three lists together so you get tuples of (kanji, reading, definition) and feed two of those into your function, depending on what you want to test. You do this twice: once for the reading, once for the definition.
You can even randomize your list of tuples to get different "orders" in which the questions are asked:
import random

kanji = ['上', '下', '大', '工', '八', '入', '山', '口', '九', '一', '人',
         '力', '川', '七', '十', '三', '二', '女']
reading = ['じょう', 'か', 'たい', 'こう', 'はち', 'にゅう', 'さん', 'こう', 'く',
           'いち', 'にん', 'りょく', 'かわ', 'しち', 'じゅう', 'さん', 'に', 'じょ']
definition = ['above', 'below', 'big', 'construction', 'eight', 'enter', 'mountain',
              'mouth', 'nine', 'one', 'person', 'power', 'river', 'seven', 'ten', 'three',
              'two', 'woman']

data = list(zip(kanji, reading, definition))
random.shuffle(data)

def question_format(prompt_type, lang, kanji, solution):
    """Creates a question about *kanji* - the correct answer is *solution*.
    Returns 1 if correct else 0."""
    question_prompt = f'What is the {prompt_type} for {kanji}? (Keyboard: {lang})'
    usr = input(question_prompt)
    if usr == solution:
        return 1
    else:
        return 0

questions_asked = 0
correct = 0

for (kanji, reading, _) in data:
    correct += question_format('READING', 'Japanese', kanji, reading)
    questions_asked += 1
    print('You got ', correct, '/', questions_asked)

for (kanji, _, definition) in data:
    correct += question_format('DEFINITION', 'English', kanji, definition)
    questions_asked += 1
    print('You got ', correct, '/', questions_asked)
After zipping our lists and shuffling them, data looks like:
[('山', 'さん', 'mountain'), ('女', 'じょ', 'woman'), ('力', 'りょく', 'power'),
('上', 'じょう', 'above'), ('九', 'く', 'nine'), ('川', 'かわ', 'river'),
('入', 'にゅう', 'enter'), ('三', 'さん', 'three'), ('口', 'こう', 'mouth'),
('二', 'に', 'two'), ('人', 'にん', 'person'), ('七', 'しち', 'seven'),
('一', 'いち', 'one'), ('工', 'こう', 'construction'), ('下', 'か', 'below'),
('八', 'はち', 'eight'), ('十', 'じゅう', 'ten'), ('大', 'たい', 'big')]

XGetWindowProperty and ctypes

Question
I'm trying to find the _NET_WM_NAME property for each of the windows/clients that X11 reports. The problem is that nothing is returned: the number of items is 0, and the returned data results in an empty string. I've looked at multiple code examples throughout GitHub and at examples written in C and C++, specifically Why is XGetWindowProperty returning null? as well as Xlib XGetWindowProperty Zero items returned, but I cannot find the problem in my code. Seemingly everything is fine: the order of parameters passed to XGetWindowProperty is in accordance with the documentation, and the function returns a success status, yet the results are empty. Where is the problem in my code?
Code
Below is the code I am working with. The issue is in the xgetwindowproperty function. The other parts below it work fine and are provided only for completeness.
#! /usr/bin/env python3
import sys
from ctypes import *

def xgetwindowproperty(display, w):
    actual_type_return = c_ulong()
    actual_format_return = c_int()
    nitems_return = c_ulong()
    bytes_after_return = c_ulong()
    prop_return = POINTER(c_ubyte)()

    wm_name = Xlib.XInternAtom(display, '_NET_WM_NAME', False)
    utf8atom = Xlib.XInternAtom(display, 'UTF8_STRING', False)
    print('_NET_WM_NAME', wm_name, 'UTF8_STRING', utf8atom)

    # AnyPropertyType = c_long(0)
    status = Xlib.XGetWindowProperty(
        display,
        w,
        wm_name,
        0,
        65536,
        False,
        utf8atom,
        byref(actual_type_return),
        byref(actual_format_return),
        byref(nitems_return),
        byref(bytes_after_return),
        byref(prop_return)
    )
    print(nitems_return.value)  # returns 0

    # empty string as result
    print('Prop', ''.join([chr(c) for c in prop_return[:bytes_after_return.value]]))
    Xlib.XFree(prop_return)
    print('#' * 10)

# -------
Xlib = CDLL("libX11.so.6")
display = Xlib.XOpenDisplay(None)
if display == 0:
    sys.exit(2)

w = Xlib.XRootWindow(display, c_int(0))

root = c_ulong()
children = POINTER(c_ulong)()
parent = c_ulong()
nchildren = c_uint()

Xlib.XQueryTree(display, w, byref(root), byref(parent), byref(children), byref(nchildren))

for i in range(nchildren.value):
    print("Child:", children[i])
    xgetwindowproperty(display, children[i])
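No answer appears in this excerpt, but one plausible culprit (an assumption on my part, not something stated above) is a classic ctypes pitfall: without restype/argtypes declarations, ctypes treats every parameter and return value as a C int, so on a 64-bit system the Display* returned by XOpenDisplay gets truncated before it is handed back to Xlib. A sketch of the declarations that would rule this out; note that Python 3 ctypes also wants byte strings, not str, for char* parameters:
from ctypes import CDLL, c_void_p, c_char_p, c_int, c_ulong

Xlib = CDLL("libX11.so.6")

# Without these declarations ctypes assumes int everywhere,
# which silently truncates 64-bit pointers like the Display* handle.
Xlib.XOpenDisplay.restype = c_void_p
Xlib.XOpenDisplay.argtypes = [c_char_p]
Xlib.XInternAtom.restype = c_ulong  # Atom is an unsigned long
Xlib.XInternAtom.argtypes = [c_void_p, c_char_p, c_int]

display = Xlib.XOpenDisplay(None)
wm_name = Xlib.XInternAtom(display, b'_NET_WM_NAME', False)  # bytes, not str
print('_NET_WM_NAME atom:', wm_name)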

UnboundLocalError: local variable 'emoji_count' referenced before assignment

Hi, I have a niggling issue with a cog (bot module) I'm writing: I keep getting an UnboundLocalError: referenced before assignment. I'm aware this is a very common issue, but I'm not seeing the problem.
The module works, but every time a post is reacted to with a star it throws this error in the console.
The error is:
starboard.py", line 22, in on_reaction_add
if emoji_count > 0: #if 0 then 1 counts
UnboundLocalError: local variable 'emoji_count' referenced before assignment
The specific area I'm looking at is:
async def on_reaction_add(self, reaction, user):
    for guild in self.bot.guilds:
        chan = get(guild.channels, name="starboard")
        if chan:
            if reaction.message.author == user:
                return
            if reaction.emoji == '⭐' or reaction.emoji == '🌟':
                if not chan:
                    return
                emoji_count = reaction.message.reactions[0].count
                msg = f"{reaction.message.author.mention} your post was posted to starboard."
                em = discord.Embed(color=discord.Color(random.randint(0x000000, 0xFFFFFF)))
                display = f"""{reaction.message.content}"""
                em.description = display
                em.set_author(name=reaction.message.author.name, icon_url=reaction.message.author.avatar_url)
                em.set_footer(text=f"Posted in: #{chan.name}")
                em.timestamp = dt.datetime.utcnow()
            try:
                img_url = reaction.message.attachments[0].url
            except IndexError:
                img_url = None
            if not img_url:
                try:
                    img_url = reaction.message.embeds[0].url
                except IndexError:
                    img_url = None
            if img_url:
                em.set_image(url=str(img_url))
            if emoji_count > 0:  # if 0 then 1 counts
                if not chan:
                    return
                await chan.send(msg)
                await chan.send(embed=em)
If anyone can tell me what's going on here and where I'm going wrong, I'd much appreciate it.
When the condition in if reaction.emoji == '⭐' or reaction.emoji == '🌟': doesn't evaluate to True, emoji_count never gets initialized
(emoji_count = reaction.message.reactions[0].count).
So when you try to use it a couple of lines further down, in if emoji_count > 0:, you get
local variable 'emoji_count' referenced before assignment, which means exactly what it says: Python cannot find an assignment to your variable anywhere in the code that actually ran.
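A minimal sketch of the usual fix (my own illustration; count_stars is a hypothetical stand-in for the cog's handler): give the variable a default before the conditional, so every code path defines it.
def count_stars(emoji, reactions):
    emoji_count = 0  # default, so the name exists on every code path
    if emoji == '⭐' or emoji == '🌟':
        emoji_count = reactions[0]
    if emoji_count > 0:  # safe even when the check above was False
        return emoji_count
    return None

print(count_stars('⭐', [3]))   # 3
print(count_stars('👍', [3]))   # None, and no UnboundLocalError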
I think what is being said here is the following:
if emoji_count >= 2:
    if not chan:
        return True
As said in the previous answers, the assignment only runs when if reaction.emoji == '⭐' or reaction.emoji == '🌟': evaluates to True, so any later use of emoji_count has to account for the case where that condition was False.

Getting the number of old issues and the table (login and number of commits) of the most active members of the repository

I cannot get the above information using the GitHub API. Reading the documentation did not help much, and I still don't fully understand how to work with dates. Here is an example of my code for getting open issues:
import requests
import json
from datetime import datetime

username = ''  # redacted
password = ''  # redacted
another_page = True
opened = 0
closed = 0
api_oldest = 'https://api.github.com/repos/grpc/grpc/issues?per_page=5&q=sort=created:>`date -v-14d "+%Y-%m-%d"`&order=asc'
api_issue = 'https://api.github.com/repos/grpc/grpc/issues?page=1&per_page=5000'
api_pulls = 'https://api.github.com/repos/grpc/grpc/pulls?page=1'

datetime.now()
while another_page:
    r = requests.get(api_issue, auth=(username, password))
    #json_response = json.loads(r.text)
    #results.append(json_response)
    if 'next' in r.links:
        api_issue = r.links['next']['url']
        if item['state'] == 'open':
            opened += 1
        else:
            closed += 1
    else:
        another_page = False
datetime.now()
print(opened)
There are a few issues with your code. For example, what does item represent? Your code can be modified as follows to iterate over the responses and count the open issues.
import requests

username = '/'
password = '/'
another_page = True
opened = 0
closed = 0
api_issue = "https://api.github.com/repos/grpc/grpc/issues?page=1&per_page=5000"

while another_page:
    r = requests.get(api_issue, auth=(username, password))
    json_response = r.json()
    #results.append(json_response)
    for item in json_response:
        if item['state'] == 'open':
            opened += 1
        else:
            closed += 1
    if 'next' in r.links:
        api_issue = r.links['next']['url']
    else:
        another_page = False

print(opened)
If you want issues that were created in the last 14 days, note that the backtick `date` expression in your api_oldest URL is shell command substitution and will not expand inside a Python string; the cutoff date has to be computed in Python first and interpolated into the request URL, as sketched below.
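A sketch covering both parts of the question, using two endpoints I believe fit here (the issue search endpoint api.github.com/search/issues, and the repository contributors endpoint, which reports each contributor's login together with a commit count in 'contributions'); treat the exact query string as an assumption to verify against the GitHub docs:
import requests
from datetime import datetime, timedelta

# Issues created in the last 14 days, via the search API.
cutoff = (datetime.utcnow() - timedelta(days=14)).strftime('%Y-%m-%d')
search_url = ('https://api.github.com/search/issues'
              f'?q=repo:grpc/grpc+type:issue+created:>{cutoff}&sort=created&order=asc')
print('Issues created since', cutoff, ':', requests.get(search_url).json()['total_count'])

# Most active members of the repository: login and number of commits.
contributors_url = 'https://api.github.com/repos/grpc/grpc/contributors?per_page=10'
for c in requests.get(contributors_url).json():
    print(c['login'], c['contributions'])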

Last page not showing in scrapy

So my code (pasted below) almost does what I want. Instead, it covers 29 of 30 pages and then leaves out the last. Furthermore, I would preferably have it go beyond that, but the website has no button for it (the pages actually do work when you manually fill in page=31 in the link). When DEPTH_LIMIT is 29 everything is fine, but at 30 I get the following error in the command prompt:
File "C:\Users\Ewald\Scrapy\OB\OB\spiders\spider_OB.py", line 23, in parse
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[#class="volgende"]/#href').extract()[0]
IndexError: list index out of range
I've tried various approaches, but they all seem to fail me...
class OB_Crawler(CrawlSpider):
    name = 'OB5'
    allowed_domains = ["https://www.officielebekendmakingen.nl/"]
    start_urls = ["https://zoek.officielebekendmakingen.nl/zoeken/resultaat/?zkt=Uitgebreid&pst=Tractatenblad|Staatsblad|Staatscourant|BladGemeenschappelijkeRegeling|ParlementaireDocumenten&vrt=Cybersecurity&zkd=InDeGeheleText&dpr=Alle&sdt=DatumPublicatie&ap=&pnr=18&rpp=10&_page=1&sorttype=1&sortorder=4"]
    custom_settings = {
        'BOT_NAME': 'OB-crawler',
        'DEPTH_LIMIT': 30,
        'DOWNLOAD_DELAY': 0.1
    }

    def parse(self, response):
        s = Selector(response)
        next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[@class="volgende"]/@href').extract()[0]
        if len(next_link):
            yield self.make_requests_from_url(next_link)
        posts = response.selector.xpath('//div[@class = "lijst"]/ul/li')
        for post in posts:
            i = TextPostItem()
            i['title'] = ' '.join(post.xpath('a/@href').extract()).replace(';', '').replace(' ', '').replace('\r\n', '')
            i['link'] = ' '.join(post.xpath('a/text()').extract()).replace(';', '').replace(' ', '').replace('\r\n', '')
            i['info'] = ' '.join(post.xpath('a/em/text()').extract()).replace(';', '').replace(' ', '').replace('\r\n', '').replace(',', '-')
            yield i
The index-out-of-range error is the result of an incorrect xpath (you end up asking for the first item of an empty list).
Change your next_link = ... line to
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[contains(@class, "volgende")]/@href').extract()[0]
You need to use contains, which runs a predicate search and filters for what you want.
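Even with the corrected XPath, the last page of results has no "volgende" (next) link at all, so indexing with [0] would still raise there. A defensive variation (my own sketch, not part of the answer above) that uses extract_first() and only follows the link when one exists:
import scrapy

class OBSpider(scrapy.Spider):
    name = 'OB6'  # hypothetical name for this sketch
    start_urls = ['https://zoek.officielebekendmakingen.nl/zoeken/resultaat/?_page=1']  # shortened; use the full query from the question

    def parse(self, response):
        # extract_first() returns None on the last page instead of raising IndexError
        next_href = response.xpath('//a[contains(@class, "volgende")]/@href').extract_first()
        if next_href:
            yield response.follow(next_href, callback=self.parse)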
