So I'm testing my knowledge on webscraping by writing this little script (I know some things are redundant, it's just me playing around with things):
# Walk the OzBargain deal listing page by page, collecting titles until a
# previously seen node link is reached.
# NOTE(review): indentation was lost in this listing; the notes below assume
# the `for` loop and the "Load next page" check both sit inside `while True`.
def programstart():
import re
from selenium import webdriver
from bs4 import BeautifulSoup
ff=webdriver.Firefox()
ff.get('https://www.ozbargain.com.au/deals')
linkpattern = r'https://www.ozbargain.com.au/node/.*'
donotpassthislink = 'https://www.ozbargain.com.au/node/415763'
# list of found links
titlelist = []
# loop to find links
while True:
soup = BeautifulSoup(ff.page_source)
raw_links = soup.find_all(name='a', href=re.compile(linkpattern))
for links in raw_links:
# NOTE(review): '#class' is not XPath attribute syntax; presumably '@class'
# was intended (same on the pager lookup below) - confirm against the page.
titles = ff.find_element_by_xpath("//h2[#class='title']").text
# `extract` is bound ONLY here, inside the for body. If raw_links is empty
# the body never runs and `extract` stays unassigned for the check below.
# NOTE(review): `links` is the element returned by find_all, not a URL
# string; presumably links['href'] was meant for the comparison - verify.
extract = links
if donotpassthislink == extract:
break
else:
pass
titlelist.append(titles)
# Load next page
# This is the line the traceback points at: when no link matched on the very
# first page, `extract` was never assigned, hence UnboundLocalError.
if donotpassthislink == extract: # stop loop at first movie link from last time.
break
else:
ff.get(ff.find_element_by_xpath("//a[#class='pager-next active']").get_attribute('href'))
programstart()
I get this error:
Traceback (most recent call last):
File "F:/Dropbox/Funfile/ben.py", line 35, in <module>
programstart()
File "F:/Dropbox/Funfile/ben.py", line 29, in programstart
if donotpassthislink == extract: # stop loop at first movie link from last time.
UnboundLocalError: local variable 'extract' referenced before assignment
I find this weird since this loop was used in another script that works perfectly.
So I write a simplified script:
# Reduced version of the scraping loop: iterate over the characters of `test`
# and stop at the first 'u'.
test = 'lets break this sentence up'
# NOTE(review): this name shadows the built-in `list` for the rest of the module.
list = []
def func():
while True:
for i in test:
# `test` is a non-empty string, so the for body always runs at least once and
# `extract` is always bound before the check after the loop. That is why this
# version never raises UnboundLocalError, unlike a loop over an empty result.
extract = i
print(extract)
if extract == 'u':
break
else:
pass
list.append(extract)
if extract == 'u':
print('done')
break
func()
And it executes without any issues.
What's going on?
Related
I've been programming in Python for a while, but this is my first time using multiprocessing.
I made a program that scrapes a local weather station for the ambient temperature using beautifulsoup4 every minute. The program also reads temperatures from several sensors and uploads everything to a MySQL database. This all works fine, but occasionally (about once a day) the request to the local weather station fails to retrieve the webpage. This causes BeautifulSoup to start an infinite loop, which effectively stops all functionality of the program. To combat this, I tried my hand at multiprocessing.
I've coded a check that kills the extra process if it is still running after 10 seconds. Here is where things go wrong: normally the BeautifulSoup process finishes after 2-4 seconds. However, when BeautifulSoup gets stuck in its loop, not only is the process terminated, but the entire program stops doing anything at all.
I've copied the relevant snippets of code. Please note that some vars are declared outside of the snippets, the code works with exception of the problem described above. Btw I am very much aware that there is a plethora of ways to make my code more efficient. Refining the code is something that I'll do when its working stable :) Thanks in advance for your help!
Imports:
...
from multiprocessing import Process, Queue
import multiprocessing
from bs4 import BeautifulSoup #sudo apt-get install python3-bs4
BeautifulSoup section:
def get_ZWS_temp_out(temp, timeout=5):
    """Scrape the outdoor temperature and publish it on the module-level queue.

    The page at the module-level ``url`` is fetched and the value inside the
    weather-station widget is parsed. On any failure (network error, non-200
    response, unexpected markup) the error is printed and the incoming
    ``temp`` is passed through unchanged, so the caller always receives a
    value.

    Args:
        temp: Last known temperature, used as the fallback result.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests.get`` may block indefinitely on a stalled connection.

    Returns:
        The freshly parsed temperature as a float, or ``temp`` on failure.
        The same value is also put on ``Q`` for the parent process.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            tb = soup.findAll("div", {"class": "elementor-element elementor-element-8245410 elementor-widget__width-inherit elementor-widget elementor-widget-wp-widget-live_weather_station_widget_outdoor"})
            tb2 = tb[0].findAll("div", {"class": "lws-widget-big-value"})
            # The value is taken from a fixed window near the end of the
            # element's markup: the first 4 of its last 10 characters.
            fragment = str(tb2[0])[-10:][:4]
            # For readings below 10 the window still contains the '>' that
            # closes the opening tag; drop it and keep the last 3 characters.
            if fragment[:1] == '>':
                fragment = fragment[-3:]
            temp = float(fragment)
    except Exception as error:
        # Best effort: report the problem and fall back to the previous value.
        print(error)
    Q.put(temp)
    return temp
Main program:
# Main polling loop (original version): once per minute, run the weather
# scraper in a child process and kill it if it is still alive after 10 s.
Q = Queue()
while 1 == 1:
strings = time.strftime("%Y,%m,%d,%H,%M,%S")
t = strings.split(',')
time_numbers = [ int(x) for x in t ]
# time_numbers[4] is the minute field of the formatted timestamp above.
if last_min != time_numbers[4]:
targettemp = get_temp_target(targettemp)
p = Process(target=get_ZWS_temp_out, name="get_ZWS_temp_out", args=(ZWS_temp_out,))
p.start()
i = 0
join = True
# NOTE: nothing leaves this loop early, so it always sleeps the full
# 10 seconds even when the child finished after 2-4 seconds.
while i < 10:
i = i + 1
time.sleep(1)
if p.is_alive() and i == 10: # only acts on the 10th pass; earlier passes just sleep
print(datetime.datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M:%S"),": ZWS getter is running for too long... let's kill it...")
# Terminate ZWS query
p.terminate()
i = 10
join = False
if join == True:
p.join()
# NOTE(review): per the traceback, a later `ZWS_temp_out = Q.get()` (not shown
# in this snippet) runs even after terminate(). A killed child never reaches
# its Q.put(), so that get() presumably blocks forever - confirm placement.
Thanks in advance for your time :)
I have to manually stop the program which gives the following output:
pi@Jacuzzi-pi:~ $ python3 /home/pi/Jacuzzi/thermometer.py
temperature sensors observer and saving program, updates every 3,5 seconds
2019-10-28 03:50:11 : ZWS getter is running for too long... let's kill it...
^CTraceback (most recent call last):
File "/home/pi/Jacuzzi/thermometer.py", line 283, in <module>
ZWS_temp_out = Q.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 94, in get
res = self._recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
I believe your program is waiting infinitely to pull items from the queue you've created. I can't see the line in the code you've posted, but it appears in the error message:
ZWS_temp_out = Q.get()
Since the get_ZWS_temp_out process is the one that adds items to the queue, you need to make sure that the process is running before you call Q.get(). I suspect this line of code gets executed between the act of terminating the timed-out process and restarting a new process, where instead it should be called after the new process is created.
Based on what Rob found out this is the updated (working) code for the main program, the others are unchanged:
# Main polling loop: once per minute, run the weather scraper in a child
# process, give it at most 10 seconds, and read its result from the queue
# only when it actually finished.
Q = Queue()
while True:
    strings = time.strftime("%Y,%m,%d,%H,%M,%S")
    t = strings.split(',')
    time_numbers = [int(x) for x in t]
    # time_numbers[4] is the minute field; do the work once per new minute.
    if last_min != time_numbers[4]:
        targettemp = get_temp_target(targettemp)
        p = Process(target=get_ZWS_temp_out, name="get_ZWS_temp_out", args=(ZWS_temp_out,))
        p.start()
        # join() with a timeout returns as soon as the child exits, instead
        # of always sleeping the full 10 seconds.
        p.join(10)
        completion = not p.is_alive()
        if completion:
            # The child only exits after its Q.put(), so a value is waiting.
            ZWS_temp_out = Q.get()
        else:
            print(datetime.datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M:%S"),": ZWS getter is running for too long... let's kill it...")
            # Terminate ZWS query and reap the child so it does not linger
            # as a zombie. Q.get() is skipped: a killed child put nothing.
            p.terminate()
            p.join()
I am analyzing XML-structured text files about insider dealings. I wrote some code to parse the XML structure and write my output to a CSV file. Each file produces one row, and each piece of extracted information goes into its own column. However, in some files a piece of information appears multiple times, and my code overwrites the value in the cell, so in the end only one date ends up in the cell of my CSV file.
import csv
import glob
import re
import string
import time
import bs4 as bs
# User defined glob pattern for the files to be parsed.
# A raw string cannot end in a single backslash, and glob.glob() needs a
# pattern rather than a bare directory, hence the trailing wildcard.
TARGET_FILES = r'D:\files\*.*'
# User defined file pointer to LM dictionary
# User defined output file
OUTPUT_FILE = r'D:\ouput\Parser.csv'
# Setup output: column headers, in the same order as the list built by get_data()
OUTPUT_FIELDS = [
    'Datei',
    'transactionDate',
    'transactionsCode',
    'Director',
    'Officer',
    'Titel',
    '10-% Eigner',
    'sonstiges',
    'SignatureDate',
]
def main():
    """Parse every file matching TARGET_FILES and write one CSV row per file.

    The output file starts with the OUTPUT_FIELDS header row. Each following
    row is the list returned by get_data() with the file path in column 0.
    """
    # The with-block guarantees the CSV is flushed and closed even if a
    # file fails to parse part-way through.
    with open(OUTPUT_FILE, 'w') as f_out:
        wr = csv.writer(f_out, lineterminator='\n', delimiter=';')
        wr.writerow(OUTPUT_FIELDS)

        for file in glob.glob(TARGET_FILES):
            print(file)
            with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
                soup = bs.BeautifulSoup(f_in, 'xml')
            output_data = get_data(soup)
            output_data[0] = file
            wr.writerow(output_data)
def _last_nested_text(soup, outer_tag, inner_tag, fallback):
    """Return the text of ``inner_tag`` inside the LAST ``outer_tag`` element.

    Returns 0 when there is no ``outer_tag`` at all. Returns ``fallback`` as
    soon as one ``outer_tag`` lacks ``inner_tag``; later elements are then
    not examined.
    """
    value = 0
    try:
        for item in soup.find_all(outer_tag):
            value = item.find(inner_tag).text
    except AttributeError:
        # item.find() returned None: the inner tag is missing.
        value = fallback
    return value


def _child_text(item, tag, fallback='ka'):
    """Return the text of ``tag`` below ``item``, or ``fallback`` if absent."""
    try:
        return item.find(tag).text
    except AttributeError:
        return fallback


def get_data(soup):
    """Extract one output row from a parsed filing.

    The returned list has 9 slots whose indexes are the CSV column numbers.
    Slot 0 is left as 0 for the caller to fill with the file name.

    Where a tag occurs several times (e.g. several transactions on one
    form) only the last occurrence is kept; earlier values are overwritten.
    """
    _odata = [0] * 9

    _odata[1] = _last_nested_text(soup, 'transactionDate', 'value', 'keine Angabe')
    _odata[2] = _last_nested_text(soup, 'transactionAcquiredDisposedCode', 'value', 'ka')

    # Column number -> child tag of <reportingOwnerRelationship>.
    relationship_columns = (
        (3, 'isDirector'),
        (4, 'isOfficer'),
        (5, 'officerTitle'),
        (6, 'isTenPercentOwner'),
        (7, 'isOther'),
    )
    for item in soup.find_all('reportingOwnerRelationship'):
        for column, tag in relationship_columns:
            _odata[column] = _child_text(item, tag)

    _odata[8] = _last_nested_text(soup, 'ownerSignature', 'signatureDate', 'ka')
    return _odata
# Script entry point: print a timestamped start banner, run the parser, then
# print a timestamped completion banner.
if __name__ == '__main__':
print('\n' + time.strftime('%c') + '\nGeneric_Parser.py\n')
main()
print('\n' + time.strftime('%c') + '\nNormal termination.')
The code does work, but it overwrites a column if, for example, more than one transaction date is given in the file. So I need code that automatically uses the next column for each additional transaction date. How could this work?
I would be glad if someone has a solution for my problem. Thanks a lot!
Your issue is that you are iterating over the result of
soup.find_all()
and every time you are writing to the same value. You need to do something with
_odata in each iteration, otherwise you will only end up with whatever is written to it the last time.
If you can show us what the data you're trying to parse actually looks like, perhaps we could give a more specific answer.
I was working on some code which retrieves a line from a text file ("save[#]") with the format:
"[name],[boolean or integer value]"
(The aim is to be able to retrieve it for save states of a game)
The issue is that whenever I try to return a value from my module I get the following:
Traceback (most recent call last):
File "//IHS-FS-001.ihs.local/Cohort2020$/2ELGAG1/python/srctg/test.py", line 5, in <module>
retrieve()
File "//IHS-FS-001.ihs.local/Cohort2020$/2ELGAG1/python/srctg/test.py", line 3, in retrieve
if retrieve.check("test", 1) == True:
AttributeError: 'function' object has no attribute 'check'
The test module is set up to test the code for the game:
# NOTE: `import retrieve.py` asks for a submodule named `py` inside a package
# named `retrieve`; the module should be imported without the extension.
import retrieve.py
# NOTE: defining a function called `retrieve` rebinds the global name, so
# `retrieve.check` below is looked up on this function, not on the module.
# That is the source of the AttributeError in the traceback.
def retrieve():
if retrieve.check("test", 1) == True:
return True
retrieve()
The retrieve module itself is set up like so:
import error
def check(i_name, save):
    """Look up ``i_name`` in save file ``save<save>.txt`` and return its value.

    Each line of the save file has the form ``name,value``. The value is
    returned as a bool for ``True``/``False`` (case-insensitive) and as an
    int for integer text. If the name occurs more than once the last entry
    wins.

    Args:
        i_name: Name of the entry to look up.
        save: Save slot number (or string) used to build the file name.

    Returns:
        The parsed bool or int. The string ``"error"`` is returned when the
        stored value is neither, or when ``i_name`` is not in the file.
    """
    save_n = "save" + str(save) + ".txt"
    # The with-block closes the file even if parsing fails.
    with open(save_n, "r") as save_f:
        rows = [line.strip().split(",") for line in save_f]

    i_return = "error"
    for row in rows:
        # Skip blank or malformed lines (no comma) and non-matching names.
        if len(row) < 2 or row[0] != i_name:
            continue
        raw = row[1].strip()
        lowered = raw.lower()
        if lowered in ("true", "false"):
            # bool("False") would be True (any non-empty string is truthy),
            # so compare the text instead.
            i_return = lowered == "true"
        elif int_check(raw):
            i_return = int(raw)
        else:
            print(error.code("001"))
            return "error"
    return i_return
def int_check(value):
    """Return True if ``value`` can be converted with ``int()``, else False.

    Values that ``int()`` rejects with TypeError (e.g. ``None``) are also
    reported as False rather than raising.
    """
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True
def bool_check(value):
    """Return True if ``value`` is a bool or the text ``true``/``false``.

    ``bool()`` accepts every object and never raises ValueError, so trying
    the conversion cannot tell booleans apart from other data. The text is
    inspected instead (case-insensitive, surrounding whitespace ignored).
    """
    if isinstance(value, bool):
        return True
    return isinstance(value, str) and value.strip().lower() in ("true", "false")
Don't include the .py in the import. This tries to import a module named py inside the package named retrieve, which is probably not what you meant.
# Import the module under an alias so that the function named `retrieve`
# defined below does not rebind the name used to reach the module.
import retrieve as retrieve_module
def retrieve():
# Returns True when the stored value compares equal to True; otherwise the
# function falls off the end and returns None.
if retrieve_module.check("test", 1) == True:
return True
Also, don't write a function with the same name as the module you just imported. Change the name of one or the other. That's why it can't find the .check attribute. It's looking inside the retrieve function instead of inside the retrieve module because you overwrote it in the global namespace (by executing the function definition) before you called the function.
I cannot figure out how to get Python to print all the students with the highest mark; it only prints the first one it encounters.
It takes its input from a file that has a few lines like the following:
89 Michael Dunne (grade name)
I know I can use the zip function, but I cannot figure out how to print only the name from it.
If I add "highstudents = sorted(zip(grade,name),reverse=True)" it sorts from high to low, but I do not know how to extract just the name, as it prints "(89, 'Pepe')".
The code below is the following attempt so far.
import sys
def topgrade(x):
    """Return the highest mark in ``x``.

    Raises:
        ValueError: If ``x`` is empty.
    """
    return max(x)
# Read "<mark> <name>" lines from the file named on the command line and
# print the best mark together with one student who achieved it.
def main():
s = sys.argv[1]
# grade[i] and name[i] belong together; the two lists are filled in step.
grade=[]
name = []
try:
with open(s,'r') as studata:
for line in studata:
try:
line = line.strip()
# The mark is read from the first two characters and the name from
# character 3 onwards, so this assumes exactly two-digit marks.
grade.append(int(line[0:2]))
name.append(line[3::])
except ValueError:
print("Invalid mark",line[0:2],"encountered. Skipping.")
top = topgrade(grade)
# list.index() returns the position of the FIRST occurrence only, which is
# why a single name is printed even when several students share the top mark.
a = grade.index(top)
print("Best students:",name[a])
print("Best mark:",top)
except FileNotFoundError:
print("File not found:",s)
if __name__ == '__main__':
main()
Rather than trying to keep the students and marks in 2 separate lists (with the risk that they get out of step) it is better to use a dictionary - where the key is the mark and the value is a list of the student(s) who obtained that mark.
Then it is a simple task of just printing out the highest key, and the associated list of students. I'm using defaultdict as an easier option than having to create or append to the list for each value.
from collections import defaultdict
import sys
def main():
    """Print every student who achieved the best mark in the given file.

    The file named by ``sys.argv[1]`` holds one ``<mark> <name>`` entry per
    line. Lines whose mark is not an integer are reported and skipped;
    blank lines are ignored.
    """
    s = sys.argv[1]
    # mark -> names of all students with that mark, in file order
    grades = defaultdict(list)
    try:
        with open(s, 'r') as studata:
            for line in studata:
                line = line.strip()
                if not line:
                    continue
                # Split on the first space instead of slicing fixed columns,
                # so marks of any width (7, 89, 100) are read correctly.
                mark_text, _, student = line.partition(' ')
                try:
                    mark = int(mark_text)
                except ValueError:
                    print("Invalid mark", mark_text, "encountered. Skipping.")
                    continue
                grades[mark].append(student.strip())
    except FileNotFoundError:
        print("File not found:", s)
        return

    if not grades:
        # max() would raise ValueError on an empty mapping.
        print("No valid marks found in:", s)
        return

    top_mark = max(grades)
    print("Best students:{}".format(','.join(grades[top_mark])))
    print("Best mark: {}".format(top_mark))
if __name__ == '__main__':
main()
I am working on an AI like Jarvis in Python 3. I am using the Python speech_recognition module, PyAudio, and everything else required according to this page:
https://pypi.python.org/pypi/SpeechRecognition/
I have it on a Raspberry Pi now; before, I was using my Mac, where it was working fine. Now I sometimes get an error when running my Jarvis code on the Raspberry Pi. Not always, but frequently enough to put a wrench in our progress. Not knowing when the error will come is a big problem, and we need to get rid of it. I'm using a Blue Snowball mic. Here is my code and my error; if you could help, that would be great. Thanks!
Traceback (most recent call last):
File "/media/pi/TRAVELDRIVE/Jarvis(10.0).py", line 172, in <module>
with m as source: r.adjust_for_ambient_noise(source)
File "/usr/local/lib/python3.4/dist-packages/speech_recognition/__init__.py", line 140, in __enter__
input=True, # stream is an input stream
File "/usr/local/lib/python3.4/dist-packages/pyaudio.py", line 750, in open
stream = Stream(self, *args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/pyaudio.py", line 441, in __init__
self._stream = pa.open(**arguments)
OSError: [Errno -9999] Unanticipated host error
Jarvis.py
#JARVIS mark 10. python 3.5.1 version
#JUST.A.RATHER.VERY.INTELEGENT.SYSTEM.
##import speech_recognition
##import datetime
##import os
##import random
##import datetime
##import webbrowser
##import time
##import calendar
from difflib import SequenceMatcher
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
import speech_recognition as sr
import sys
from time import sleep
import os
import random
# Shared recognizer and microphone objects used by the conversation loop.
r = sr.Recognizer()
m = sr.Microphone()
# Brain functions, vocab!
what_i_should_call_someone = [""]
Good_Things = ["love","sweet","nice","happy","fun","awesome","great"]
Bad_Things = ["death","kill","hurt","harm","discomfort","rape","pain","sad","depression","depressed","angry","mad","broken","raging","rage"]
# Words that you might say at the beginning of your input, for example: "um hey where are we!?!"
Slang_Words = ["um","uh","hm","eh"]
# Put all greetings in here
# NOTE(review): these are capitalised, but the input is lower-cased before it
# is tokenised, so `word in Static_Greetings` cannot match as written.
Static_Greetings = ["Hey","Hi","Hello"]
# Put your AI's name and other names just in case.
Name = ["jarvis"]
posible_answer_key_words = ["becuase","yes","no"]
Chance_that_question_was_asked_1 = 0
Chance_that_question_was_asked_2 = 0
certainty_question_was_asked = 0
Me_statment_keywords = ["you","your","yours"]
You_statment_keywords = ["i","i'm","me"]
# NOTE: a `global` statement at module level has no effect and does not
# create the variable; the name stays undefined until it is assigned.
global certainty_person_is_talking_to_me
what_i_said = ("")
Just_asked_querstion = False
the_last_thing_i_said = ("")
the_last_thing_person_said = ("")
what_person_said = ("")
what_person_said_means = [""]
what_im_about_to_say = [""]
why_im_about_to_say_it = [""]
who_im_talking_to = [""]
how_i_feel = [""]
why_do_i_feel_the_way_i_do = [""]
what_i_am_thinking = ("")
# ways to describe the nouns last said
it_pronouns = ["it","they","she","he"]
# last person, place or thing described, spoken or discussed!
last_nouns = [""]
# Sample questions the fuzzy matcher compares the user's input against.
# One entry per line with a trailing comma: a missing comma between two
# adjacent string literals silently glues them into a single entry.
Sample_Questions = [
    "what is the weather like",
    "where are we today",
    "why did you do that",
    "where is the dog",
    "when are we going to leave",
    "why do you hate me",
    "what is the Answer to question 8",
    "what is a dinosour",
    "what do i do in an hour",
    "why do we have to leave at 6.00",
    "When is the apointment",
    "where did you go",
    "why did you do that",
    "how did he win",
    "why won’t you help me",
    "when did he find you",
    "how do you get it",
    "who does all the shipping",
    "where do you buy stuff",
    "why don’t you just find it in the target",
    "why don't you buy stuff at target",
    "where did you say it was",
    "when did he grab the phone",
    "what happened at seven am",
    "did you take my phone",
    "do you like me",
    "do you know what happened yesterday",
    "did it break when it dropped",
    "does it hurt everyday",
    "does the car break down often",
    "can you drive me home",
    "where did you find me",
    "can it fly from here to target",
    "could you find it for me",
]
# Sample greetings for the same kind of fuzzy comparison.
Sample_Greetings = ["hey","hello","hi","hey there","hi there","hello there","hey jarvis","hey dude"]
Question_Keyword_Answer = []
Int_Question_Keywords_In_Input = []
# Words whose presence suggests that the input is a question (duplicates removed).
Possible_Question_Key_Words = ["whats","what","where","when","why","isn't","who","should","would","could","can","do","does","did"]
# Words whose presence suggests a greeting. Name is itself a list, so it is
# concatenated rather than nested; a nested list would never equal a word.
Possible_Greeting_Key_Words = ["hey","hi","hello"] + Name
# In this function: Analyze the user input find out if it's (Question, Answer, Command. Etc) and what is being: Asked, Commanded, ETC.
# Run the greeting and question heuristics on the most recent utterance.
# Reads the module-level names what_person_said_l (set from value.lower())
# and what_person_said_l_wt (its word tokens).
# NOTE(review): indentation was lost in this listing; the notes below assume
# the helper functions are nested as their names suggest.
def Analyze():
def Analyze_For_Greeting():
# True if at least one token is a known greeting keyword; otherwise the
# function falls off the end and returns None.
def Greeting_Keyword_Check():
global Possible_Greeting_Key_Words
Int_Greeting_Keywords_In_Input = []
for words in what_person_said_l_wt:
if words in Possible_Greeting_Key_Words:
Int_Greeting_Keywords_In_Input.append(words)
Amount_Greeting_Keywords = (len(Int_Greeting_Keywords_In_Input))
if Amount_Greeting_Keywords > 0:
return True
# True at the first sample greeting whose similarity ratio exceeds 0.5.
def Greeting_Sentence_Match():
for Ran_Greeting in Sample_Greetings:
Greeting_Matcher = SequenceMatcher(None, Ran_Greeting, what_person_said_l).ratio()
if Greeting_Matcher > 0.5:
print (Greeting_Matcher)
print ("Similar to Greeting: "+Ran_Greeting)
return True
# NOTE(review): both results are discarded here, so the greeting analysis has
# no effect beyond the prints above.
Greeting_Keyword_Check()
Greeting_Sentence_Match()
# In this function: determine if the input is a question or not.
def Analyze_For_Question():
# In this function: if there is at least one question keyword in the user input then return true.
def Question_Keyword_Check():
global Possible_Question_Key_Words
Int_Question_Keywords_In_Input = []
for words in what_person_said_l_wt:
if words in Possible_Question_Key_Words:
Int_Question_Keywords_In_Input.append(words)
Amount_Question_keywords = (len(Int_Question_Keywords_In_Input))
if Amount_Question_keywords > 0:
return True
# In this function: if the user's input is similar to other sample questions, return true.
def Question_Sentence_Match():
for Ran_Question in Sample_Questions:
Question_Matcher = SequenceMatcher(None, Ran_Question, what_person_said_l).ratio()
if Question_Matcher > 0.5:
print (Question_Matcher)
print ("Similar to Question: "+Ran_Question)
return True
# In this function: if the first word of the user's input is a question keyword and there is a different question keyword in the input return true.
# NOTE(review): the body below does not do what the comment above says; it
# only looks for a greeting word or the AI's name among the tokens.
def Question_Verb_Noun_Check():
# if you say "hey jarvis" before something like a question or command it will still understand
try:
for word in what_person_said_l_wt:
if word in Static_Greetings or word in Name:
print (word)
# list.remove() mutates the token list in place and returns None, so
# Minus_Begin_Greet1 is always None. The list is also being modified while
# it is iterated.
Minus_Begin_Greet1 = what_person_said_l_wt.remove(word)
print (Minus_Begin_Greet1)
return True
except IndexError:
pass
# NOTE(review): each check runs here with its result discarded and then runs
# again inside the `if`, so the prints and the token removal happen twice.
Question_Keyword_Check()
Question_Sentence_Match()
Question_Verb_Noun_Check()
if Question_Keyword_Check()==True and Question_Sentence_Match()==True and Question_Verb_Noun_Check()==True:
return True
else:
return False
# All the functions in Analyze
# NOTE(review): the True/False result of Analyze_For_Question() is not used or
# returned, so Analyze() itself returns None.
Analyze_For_Greeting()
Analyze_For_Question()
# Conversation loop: calibrate for ambient noise, listen for one utterance,
# send it to Google Speech Recognition and hand the text to Analyze().
Conversation = True
Conversation_Started = False
while Conversation:
    try:
        if not Conversation_Started:
            #Greeting()
            Conversation_Started = True
        # Open the audio device once per turn and use the same stream for
        # calibration and listening, instead of opening it twice.
        # NOTE(review): calibrating once before the loop may be sufficient;
        # confirm the intended behaviour before changing it.
        with m as source:
            r.adjust_for_ambient_noise(source)
            print(format(r.energy_threshold))
            print("Say something!") # just here for now and testing porposes so we know whats happening
            audio = r.listen(source)
        print("Got it! Now to recognize it...")
        try:
            # recognize speech using Google Speech Recognition
            value = r.recognize_google(audio)
            print("You said {}".format(value))
            # Analyze() reads these two module-level names.
            what_person_said_l = value.lower()
            what_person_said_l_wt = word_tokenize(what_person_said_l)
            Analyze()
        except sr.UnknownValueError:
            print ("what was that?")
        except sr.RequestError as e:
            print("Uh oh! Sorry sir Couldn't request results from Google Speech Recognition service; {0}".format(e))
    except KeyboardInterrupt:
        # Ctrl-C ends the conversation. Swallowing the interrupt with `pass`
        # inside the loop would simply start the next turn.
        break