Delete duplicated feed titles crawled by webhoseio - python-3.x

Got news feeds crawled by webhoseio. Now need to delete the duplicated titles. Below are my codes. Something must be wrong because the output still has duplicated titles. Please help find the problem. Thanks.
count_dup = 0
for j in range(0,len(feeds)):
SELECTED_INDEX = j
feed_sel = feeds[SELECTED_INDEX]
#print(feed_sel['title'])
feed_hash = Simhash(str(feed_sel['title']))
dup_indices = index.get_near_dups(feed_hash)
#print("Number of duplicates (SimHash): " + str(len(dup_indices)))
for dupi in dup_indices:
try:
score = calc_similarity(feed_sel['title'], feeds[int(dupi)]['title'], model_word2vec)
except:
score = 0
if score > 0.85:
if feeds[int(dupi)]['id'] == j:
print(feeds[int(dupi)]['id'], feeds[int(dupi)]['title'])
else:
feeds.pop(feeds[int(dupi)]['id'] - count_dup)
count_dup += 1

Related

Cannot get text from found list though text exists

I am trying to scrape reference texts from this: paper
When I go to the site, the references section does not show up. To see them, I should either click "References" or "+Show References". I am trying to find references link and click it.
Here is my code:
browser.get('https://doi.org/10.3847/1538-4357/abb3c9')
refCheck = ["references", "cited literature", "literature cited", "refs"]
for h in range(0, len(browser.find_elements(By.XPATH, '//a[#href]'))):
textSearch = browser.find_elements(By.XPATH, '//a[#href]')[h].text
href = browser.find_elements(By.XPATH, '//a[#href]')[h].get_attribute("href")
if (textSearch.lower() in refCheck) & (len(href) > 0):
browser.find_elements(By.XPATH, '//a[#href]')[h].get_attribute("href")
print(h)
print(textSearch)
print(href)
break
browser.get(href)
attrList = []
refCheck = ["references", "cited literature", "literature cited", "refs"]
tags = ["ol","ul"]
for t in tags:
if len(browser.find_elements(By.TAG_NAME, t)) > 0:
for i in range(0, len(browser.find_elements(By.TAG_NAME, t))):
for attr in browser.find_elements(By.TAG_NAME, t)[i].get_property('attributes'):
for rc in refCheck:
if (rc in attr['name'].lower()) | (rc in attr['value'].lower()):
attrList.append(t)
attrList.append(i)
attrList.append(attr['name'])
attrList.append(attr['value'])
print(attr['name'])
print(attr['value'])
print(len(browser.find_elements(By.TAG_NAME, t)[i].find_elements(By.XPATH,'./li')))
if len(attrList) > 0:
break
if len(attrList) > 0:
break
if len(attrList) > 0:
break
cnt = 0
for f in browser.find_elements(By.TAG_NAME, t)[i].find_elements(By.XPATH, './li'):
print(f.text)
if len(f.text) > 0:
refList.append(f.text)
cnt += 1
print(cnt)
However, the returned text is always empty.
PS. By the way, I have tried to click href I reached instead of browser.get(href), however it does not work as well. When I tried to get the hyperlink through get_attributes("href"), it always returned a string so could not click.
How should I get that text?
EDIT:
Found the answer here: link
Using get_attribute("textContent") solved my issue.
You can use alternatively crossref.org API if you search by DOI it will give you a JSON response that contains 'reference'. You can play with the json however you want.
import requests
def get_ref(doi):
url = f'https://api.crossref.org/works/{doi}'
response = requests.get(url)
if response.status_code == 200:
response = response.json()
return response['message']['reference']
return None
doi = 'doi.org/10.3847/1538-4357/abb3c9'
get_ref_count(doi)

Calculate percentage change in pandas with rows that contain the same values

I am using Pandas to calculate percentage change(s) between values that occur more than once in the column of interest.
I want to compare the values of last weeks workout provided they're the same exercise type to get the percentage change of (weight used, reps accomplished )
I am able to get the percentages of all the rows which is halfway what I want but the conditional part is missing - so only get the percentages if the exercise_name is of the same value as we want to compare how we improve on a weekly, bi-weekly basis.
ids = self.user_data["exercise"].fillna(0)
dups = self.user_data[ids.isin(ids[ids.duplicated()])].sort_values("exercise")
dups['exercise'] = dups['exercise'].astype(str)
dups['set_one_weight'] = pd.to_numeric(dups['set_one_weight'])
dups['set_two_weight'] = pd.to_numeric(dups['set_two_weight'])
dups['set_three_weight'] = pd.to_numeric(dups['set_three_weight'])
dups['set_four_weight'] = pd.to_numeric(dups['set_four_weight'])
dups['set_one'] = pd.to_numeric(dups['set_one'])
dups['set_two'] = pd.to_numeric(dups['set_two'])
dups['set_three'] = pd.to_numeric(dups['set_three'])
dups['set_four'] = pd.to_numeric(dups['set_four'])
**percent_change = dups[['set_three_weight']].pct_change()**
the last line gets the percentage change for all the rows for column set_three_weight but is unable to do what I want above which is find rows with same name and obtain the percentage change.
UPDATE
Using Group By Solution
ids = self.user_data["exercise"].fillna(0)
dups = self.user_data[ids.isin(ids[ids.duplicated()])].sort_values("exercise")
dups['exercise'] = dups['exercise'].astype(str)
dups['set_one_weight'] = pd.to_numeric(dups['set_one_weight'])
dups['set_two_weight'] = pd.to_numeric(dups['set_two_weight'])
dups['set_three_weight'] = pd.to_numeric(dups['set_three_weight'])
dups['set_four_weight'] = pd.to_numeric(dups['set_four_weight'])
dups['set_one'] = pd.to_numeric(dups['set_one'])
dups['set_two'] = pd.to_numeric(dups['set_two'])
dups['set_three'] = pd.to_numeric(dups['set_three'])
dups['set_four'] = pd.to_numeric(dups['set_four'])
dups['routine_upload_date'] = pd.to_datetime(dups['routine_upload_date'])
# percent_change = dups[['set_three_weight']].pct_change()
# Group the exercises together and create a new cols that represent the percentage delta variation in percentages
dups.sort_values(['exercise', 'routine_upload_date'], inplace=True, ascending=[True, False])
dups['set_one_weight_delta'] = (dups.groupby('exercise')['set_one_weight'].apply(pd.Series.pct_change) + 1)
dups['set_two_weight_delta'] = (dups.groupby('exercise')['set_two_weight'].apply(pd.Series.pct_change) + 1)
dups['set_three_weight_delta'] = (dups.groupby('exercise')['set_three_weight'].apply(pd.Series.pct_change) + 1)
dups['set_four_weight_delta'] = (dups.groupby('exercise')['set_four_weight'].apply(pd.Series.pct_change) + 1)
dups['set_one_reps_delta'] = (dups.groupby('exercise')['set_one'].apply(pd.Series.pct_change) + 1)
dups['set_two_reps_delta'] = (dups.groupby('exercise')['set_two'].apply(pd.Series.pct_change) + 1)
dups['set_three_reps_delta'] = (dups.groupby('exercise')['set_three'].apply(pd.Series.pct_change) + 1)
dups['set_four_reps_delta'] = (dups.groupby('exercise')['set_four'].apply(pd.Series.pct_change) + 1)
print(dups.head())
I think this gets me the result(s) I want, would like someone to confirm

list index out of range but it seems impossible since it's only after 3 questions

kanji = ['上','下','大','工','八','入','山','口','九','一','人','力','川','七','十','三','二','女',]
reading = ['じょう','か','たい','こう','はち','にゅう','さん','こう','く','いち','にん','りょく','かわ','しち','じゅう','さん','に','じょ']
definition = ['above','below','big','construction','eight','enter','mountain','mouth','nine','one','person','power','river','seven','ten','three','two','woman']
score = number_of_questions = kanji_item = 0
def question_format(prompt_type,lang,solution_selection):
global reading,definition,score,num_of_questions,kanji_item
question_prompt = 'What is the '+str(prompt_type)+' for "'+str(kanji[kanji_item])+'"? (Keyboard:'+str(lang)+')\n'
solution_selection = [reading,definition]
usr = input(question_prompt)
if usr in solution_selection[kanji_item] and kanji[kanji_item]:
score += 1
num_of_questions += 1
else:
pass
kanji_item += 1
while number_of_questions != 18:
question_format('READING','Japanese',[0])
print('You got ',score,'/',number_of_questions)
while number_of_questions != 36:
question_format('DEFINITION','English',[1])
print('You got ',score,'/',number_of_questions)
I can't get past 大. but I can't see where it's messing up. I've tried to change pretty much everything. "kanji_item" is supposed to give a common index number so that the answers can match up. It gets through the first two problems with no hassle, but for some reason refuses to accept my third problem.
Problems:
- wrong name using number_of_questions vs. num_of_questions
- wrong way to check truthyness if usr in solution_selection[kanji_item] and kanji[kanji_item]: - the last part is always True as it is a non empty string
- lots of globals wich is not considered very good style
It would be easier to zip your three list together so you get tuples of (kanji, reading, description) and feed 2 of those into your function depending on what you want to test. You do this 2 times, once for reading, once for description.
You can even randomize your list of tuples to get different "orders" in which questions are asked:
kanji = ['上', '下', '大', '工', '八', '入', '山', '口', '九', '一' , '人',
'力', '川', '七', '十', '三', '二', '女',]
reading = ['じょう', 'か', 'たい', 'こう', 'はち', 'にゅう', 'さん', 'こう', 'く',
'いち', 'にん', 'りょく', 'かわ', 'しち', 'じゅう', 'さん', 'に', 'じょ']
definition = ['above', 'below', 'big', 'construction', 'eight', 'enter', 'mountain',
'mouth', 'nine', 'one', 'person', 'power', 'river', 'seven', 'ten', 'three',
'two', 'woman']
import random
data = list(zip(kanji, reading, definition))
random.shuffle(data)
def question_format(prompt_type, lang, kanji, solution):
"""Creates a question about *kanji* - the correct answer is *solution*
Returns 1 if correct else 0."""
question_prompt = f'What is the {prompt_type} for {kanji}? (Keyboard: {lang})'
usr = input(question_prompt)
if usr == solution:
return 1
else:
return 0
questions_asked = 0
correct = 0
for (kanji, reading, _) in data:
correct += question_format('READING','Japanese', kanji, reading)
questions_asked += 1
print('You got ',correct,'/',questions_asked)
for (kanji, _, definition) in data:
correct += question_format('DEFINITION','ENGLISH', kanji, definition)
questions_asked += 1
print('You got ',correct,'/',questions_asked)
After zipping our list and shuffling them data looks like
[('山', 'さん', 'mountain'), ('女', 'じょ', 'woman'), ('力', 'りょく', 'power'),
('上', 'じょう', 'above'), ('九', 'く', 'nine'), ('川', 'かわ', 'river'),
('入', 'にゅう', 'enter'), ('三', 'さん', 'three'), ('口', 'こう', 'mouth'),
('二', 'に', 'two'), ('人', 'にん', 'person'), ('七', 'しち', 'seven'),
('一', 'いち', 'one'), ('工', 'こう', 'construction'), ('下', 'か', 'below'),
('八', 'はち', 'eight'), ('十', 'じゅう', 'ten'), ('大', 'たい', 'big')]

Python pyodbc fetchmany() how to select out put to update query

I have code to fetchmany() that will output eg:10 records
And i have added iterating value for each 0 1 2 3 4 5 for print statement , now i want user input 0 or 1 and it should select column. For those input so i can update sql record for those column
cur.execute("select events.SERIALNUM, emp.LASTNAME, emp.SSNO,
events.EVENT_TIME_UTC from AccessControl.dbo.emp,
AccessControl.dbo.events where emp.id = events.empid and emp.SSNO=?
order by EVENT_TIME_UTC desc ", empid)
rows = cur.fetchmany(att_date)
n = 0
for row in rows :
event_date = row.EVENT_TIME_UTC
utc = event_date.replace(tzinfo=from_zone)
utc_to_local = utc.astimezone(to_zone)
local_time = utc_to_local.strftime('%H:%M:%S')
att_date = utc_to_local.strftime('%d:%m:%y')
print (n, row.SERIALNUM, row.LASTNAME, row.SSNO, att_date, local_time)
n = n + 1
seri_al = input("Copy And Past the serial number u want to modifiy: ")
this will output following Data
0 1500448188 FIRST NAME 03249 2017-07-19 17:01:17
1 1500448187 FIRST NAME 03249 2017-07-19 17:01:15
Eg:
seri_al = input("Copy And Past the serial number u want to modifiy: ")
instead of copying and pasting '1500448188' these numbers I want the user to only enter '0' and map that one and update sql query as for where clause serial number.
It appears that you already know how to use input to prompt for the user's choice. The only piece you are missing is to add items to a dictionary as you loop through the rows. Here is a slightly abstracted example:
rows = [('1500448188',),('1500448187',)] # test data
selections = dict()
n = 0
for row in rows:
selections[n] = row[0]
print(n, repr(row[0]))
n += 1
select = input("Enter the index (0, 1, ...) you want to select: ")
selected_key = selections[int(select)]
print("You selected " + repr(selected_key))
which prints
0 '1500448188'
1 '1500448187'
Enter the index (0, 1, ...) you want to select: 1
You selected '1500448187'

Unknown column added in user input form

I have a simple data entry form that writes the inputs to a csv file. Everything seems to be working ok, except that there are extra columns being added to the file in the process somewhere, seems to be during the user input phase. Here is the code:
import pandas as pd
#adds all spreadsheets into one list
Batteries= ["MAT0001.csv","MAT0002.csv", "MAT0003.csv", "MAT0004.csv",
"MAT0005.csv", "MAT0006.csv", "MAT0007.csv", "MAT0008.csv"]
#User selects battery to log
choice = (int(input("Which battery? (1-8):")))
def choosebattery(c):
done = False
while not done:
if(c in range(1,9)):
return Batteries[c]
done = True
else:
print('Sorry, selection must be between 1-8')
cfile = choosebattery(choice)
cbat = pd.read_csv(cfile)
#Collect Cycle input
print ("Enter Current Cycle")
response = None
while response not in {"Y", "N", "y", "n"}:
response = input("Please enter Y or N: ")
cy = response
#Charger input
print ("Enter Current Charger")
response = None
while response not in {"SC-G", "QS", "Bosca", "off", "other"}:
response = input("Please enter one: 'SC-G', 'QS', 'Bosca', 'off', 'other'")
if response == "other":
explain = input("Please explain")
ch = response + ":" + explain
else:
ch = response
#Location
print ("Enter Current Location")
response = None
while response not in {"Rack 1", "Rack 2", "Rack 3", "Rack 4", "EV001", "EV002", "EV003", "EV004", "Floor", "other"}:
response = input("Please enter one: 'Rack 1 - 4', 'EV001 - 004', 'Floor' or 'other'")
if response == "other":
explain = input("Please explain")
lo = response + ":" + explain
else:
lo = response
#Voltage
done = False
while not done:
choice = (float(input("Enter Current Voltage:")))
modchoice = choice * 10
if(modchoice in range(500,700)):
vo = choice
done = True
else:
print('Sorry, selection must be between 50 and 70')
#add inputs to current battery dataframe
log = pd.DataFrame([[cy,ch,lo,vo]],columns=["Cycle", "Charger", "Location", "Voltage"])
clog = pd.concat([cbat,log], axis=0)
clog.to_csv(cfile, index = False)
pd.read_csv(cfile)
And I receive:
Out[18]:
Charger Cycle Location Unnamed: 0 Voltage
0 off n Floor NaN 50.0
Where is the "Unnamed" column coming from?
There's an 'unnamed' column coming from your csv. The reason most likely is that the lines in your input csv files end with a comma (i.e. your separator), so pandas interprets that as an additional (nameless) column. If that's the case, check whether your lines end with your separator. For example, if your files are separated by commas:
Column1,Column2,Column3,
val_11, val12, val12,
...
Into:
Column1,Column2,Column3
val_11, val12, val12
...
Alternatively, try specifying the index column explicitly as in this answer. I believe some of the confusion stems from pandas concat reordering your columns .

Resources