I want to scrape 2000 twitter's tweets at least by below code. However, it will spend too many time around 12 hours.
My question is how to reduce time through the Threading, Multiprocesing or anything else?
from selenium.common.exceptions import StaleElementReferenceException
import time
def first_count(text):
characters = text.split()
count = 0
for character in characters:
if character.lower() == 'i' or character.lower() == 'me' or character.lower() == 'my' or character.lower() == 'mine' or character.lower() == 'myself':
count += 1
return count
def third_count(text):
characters = text.split()
count = 0
for character in characters:
if character.lower() == 'our' or character.lower() == 'us' or character.lower() == 'we' or character.lower() == 'ourselves':
count+= 1
return count
def sum_counts(text):
characters = text.split()
return len(characters)
def get_tweet(url):
i_count = 0
w_count = 0
sum_count = 0
driver_path = 'D:\scappppp\chromedrive\chromedriver.exe'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=driver_path, options = options)
driver.get(url)
# max_tweets = driver.find_element_by_xpath('.//div[#class="css-1dbjc4n r-16y2uox r-1wbh5a2 r-1pi2tsx r-1777fci"]//div[#class="css-901oao css-bfa6kz r-14j79pv r-1qd0xha r-n6v787 r-16dba41 r-1cwl3u0 r-bcqeeo r-qvutc0"]')
tweets_dict = []
'set threadholder of the number of tweets will be getting'
while len(tweets_dict) < 100:
if driver.current_url == url:
tweets = driver.find_elements_by_xpath('//div[#lang]')
try:
for intermedium in tweets:
tweet_1 =''.join([span for span in intermedium.text])
tweet = tweet_1.lower().replace('\n',' ').replace('#','').replace('#','')
if tweet not in tweets_dict:
tweets_dict.append(tweet)
# print(tweets, '\n====================')
i_count += first_count(tweet)
w_count += third_count(tweet)
sum_count += sum_counts(tweet)
else:
pass
except StaleElementReferenceException:
pass
body = driver.find_element_by_css_selector('body')
body.send_keys(Keys.PAGE_DOWN)
```if login iframe dispaly, remove it```
else:
login_button = driver.find_element_by_xpath('.//div[#class="css-1dbjc4n r-14lw9ot r-16y2uox r-1dqxon3 r-16wqof r-11nfnuw"]//div[#role="button"]')
login_button.click()
back_button = driver.find_element_by_xpath('.//div[#class="css-1dbjc4n"]//div[#class="css-901oao r-1awozwy r-18jsvk2 r-6koalj r-18u37iz r-16y2uox r-1qd0xha r-a023e6 r-b88u0q r-1777fci r-rjixqe r-bcqeeo r-q4m81j r-qvutc0"]')
back_button.click()
driver.quit()
#return tweets_dict,i_count,w_count,sum_count
def main():
df = open('ava1.txt','r')
urls = df.read().split('\n')
for i,url in enumerate(urls):
get_tweet(url)
if __name__ = 'main':
mian()
Related
In the function of find_shortest_func, i think if now position isn't "T" which is also known as the terminal or exit, then i will try to find for direction and see if it is "T", if not, check if it is space and i can go there. Besides, tell the next state function now output and dic to tell the place where i visited. But some errors occur and I don't know why.
I think the problem may occur where I tried to deepcopy the output list
import copy
def set_symbol(symbol_name):
def set_symbol_decorator(func):
def wrapper(self, symbol):
setattr(self, symbol_name, symbol)
return wrapper
return set_symbol_decorator
class Maze:
space_symbol = " "
obstacle_symbol = "X"
path_symbol = "•"
output = []
dis = 0
def __init__(self, input_string):
self.maze = []
if input_string.endswith("txt"):
with open(input_string) as f:
count = 0
for line in f.readlines():
self.maze.append([])
for j in line:
if j != '\n':
self.maze[count].append(j)
count += 1
else:
count = 0
for i in input_string.split("\n"):
self.maze.append([])
for j in i:
self.maze[count].append(j)
count += 1
def __str__(self):
output_string = ""
for i in range(20):
for j in range(20):
output_string += self.maze[i][j]
output_string += "\n"
return output_string
#set_symbol("space_symbol")
def set_space_symbol(self, change):
pass
#set_symbol("obstacle_symbol")
def set_obstacle_symbol(self, change):
pass
#set_symbol("path_symbol")
def set_path_symbol(self, change):
pass
def find_shortest_func(self, position: tuple, d: dict, out: list, dis: int):
dic = copy.deepcopy(d)
output = copy.deepcopy(out)
dic[(position[0], position[1])] = 1
output.append((position[0], (position[1])))
dis += 1
if self.maze[position[0]][position[1]] != "T":
if position[0]+1 < 20 and self.maze[position[0]+1][position[1]] == self.space_symbol and (position[0]+1, position[1]) not in dic:
self.find_shortest_func(
(position[0]+1, position[1]), dic, output, dis)
if position[1]+1 < 20 and self.maze[position[0]][position[1]+1] == self.space_symbol and (position[0], position[1]+1) not in dic:
self.find_shortest_func(
(position[0], position[1]+1), dic, output, dis)
if position[0]-1 >= 0 and self.maze[position[0]-1][position[1]] == self.space_symbol and (position[0]-1, position[1]) not in dic:
self.find_shortest_func(
(position[0]-1, position[1]), dic, output, dis)
if position[1]-1 >= 0 and self.maze[position[0]][position[1]-1] == self.space_symbol and (position[0], position[1]-1) not in dic:
self.find_shortest_func(
(position[0], position[1]-1), dic, output, dis)
if self.maze[position[0]][position[1]] == "T":
if dis < self.dis:
self.output = copy.deepcopy(output)
self.dis = dis
return
def find_shortest_path(self):
d = dict()
output = []
dis = -1
self.find_shortest_func((1, 0), d, output, dis)
return self.output, self.dis
I took this nice example of a simple curses application with a list. I wanted to make it scrollable, so I changed the part of the list that gets shown. However, I can scroll down and back up, but the contents shown doesn't change (only the highlighted line, not the lines shown).
What am I doing wrong?
MVCE
#!/usr/bin/env python
import curses
from curses import panel
class Menu(object):
def __init__(self, items, stdscreen):
self.window = stdscreen.subwin(0, 0)
self.window.keypad(1)
self.panel = panel.new_panel(self.window)
self.panel.hide()
panel.update_panels()
self.position = 0
self.items = items
def navigate(self, n):
self.position += n
if self.position < 0:
self.position = 0
elif self.position >= len(self.items):
self.position = len(self.items) - 1
def display(self):
self.panel.top()
self.panel.show()
self.window.clear()
while True:
self.window.refresh()
curses.doupdate()
start = 0
# The next 3 lines seem not to work as intended
while start + (curses.LINES - 1) < self.position:
start += curses.LINES
for index, item in enumerate(self.items[start:curses.LINES - 1], start=start):
if index == self.position:
mode = curses.A_REVERSE
else:
mode = curses.A_NORMAL
msg = "%d. %s" % (index, item[0])
self.window.addstr(1 + index, 1, msg, mode)
key = self.window.getch()
if key in [curses.KEY_ENTER, ord("\n"), curses.KEY_RIGHT]:
self.items[self.position][1]()
elif key == curses.KEY_UP:
self.navigate(-1)
elif key == curses.KEY_DOWN:
self.navigate(1)
elif key == curses.KEY_LEFT:
break
self.window.clear()
self.panel.hide()
panel.update_panels()
curses.doupdate()
class MyApp(object):
def __init__(self, stdscreen):
self.screen = stdscreen
curses.curs_set(0)
submenu_items = [("beep", curses.beep), ("flash", curses.flash)]
submenu = Menu(submenu_items, self.screen)
main_menu_items = [
("beep", curses.beep),
("flash", curses.flash),
("submenu", submenu.display),
]
for i in range(200):
main_menu_items.append((f"flash {i}", curses.flash))
main_menu = Menu(main_menu_items, self.screen)
main_menu.display()
if __name__ == "__main__":
curses.wrapper(MyApp)
Basically that's because you're not updating the upper limit on the slice used in this loop:
for index, item in enumerate(self.items[start:curses.LINES - 1], start=start):
Here's a better version
MVCE
#!/usr/bin/env python
import curses
from curses import panel
class Menu(object):
def __init__(self, items, stdscreen):
self.window = stdscreen.subwin(0, 0)
self.window.keypad(1)
self.panel = panel.new_panel(self.window)
self.panel.hide()
panel.update_panels()
self.position = 0
self.items = items
def navigate(self, n):
self.position += n
if self.position < 0:
self.position = 0
elif self.position >= len(self.items):
self.position = len(self.items) - 1
def display(self):
self.panel.top()
self.panel.show()
self.window.clear()
while True:
start = 0
self.window.clear()
while start + (curses.LINES - 1) < self.position:
start += curses.LINES
myrow = self.position - start
mycol = 0
for index, item in enumerate(self.items[start:start + curses.LINES - 1], start=start):
if index == self.position:
mode = curses.A_REVERSE
else:
mode = curses.A_NORMAL
msg = "%d. %s" % (index, item[0])
self.window.addstr(index - start, 0, msg, mode)
if index == self.position:
(myrow, mycol) = self.window.getyx()
self.window.move(myrow, mycol)
key = self.window.getch()
if key in [curses.KEY_ENTER, ord("\n"), curses.KEY_RIGHT]:
self.items[self.position][1]()
elif key == curses.KEY_UP:
self.navigate(-1)
elif key == curses.KEY_DOWN:
self.navigate(1)
elif key == curses.KEY_LEFT:
break
self.window.clear()
self.panel.hide()
panel.update_panels()
curses.doupdate()
class MyApp(object):
def __init__(self, stdscreen):
self.screen = stdscreen
curses.curs_set(1)
submenu_items = [("beep", curses.beep), ("flash", curses.flash)]
submenu = Menu(submenu_items, self.screen)
main_menu_items = [
("beep", curses.beep),
("flash", curses.flash),
("submenu", submenu.display),
]
for i in range(200):
main_menu_items.append((f"flash {i}", curses.flash))
main_menu = Menu(main_menu_items, self.screen)
main_menu.display()
if __name__ == "__main__":
curses.wrapper(MyApp)
I'm trying to create Scrapy-app, when the app execute the function 'parse' it goes well(status=200), but when it call 'parse_phone' it can't get that url and logs with the errors.
When i execute 'scrapy shell 'url that is in parse_phone' it executes without problems.
Can anyone answer what is solution?
class DeviceSpider(scrapy.Spider):
name = 'device'
start_urls = [
'https://www.gsmarena.com/makers.php3',
]
def parse(self, response):
for href in response.css('.st-text a::attr(href)'):
time.sleep(random.randint(30,50))
yield response.follow(href, self.parse_phones)
def parse_phones(self, response):
for href in response.css('#review-body a::attr(href)'):
time.sleep(random.randint(30,50))
yield response.follow(href, self.parse_device_info)
next_page = response.css('.pages-next::attr(href)').extract_first()
if next_page is not None:
time.sleep(random.randint(30,50))
next_page = response.urljoin(next_page)
yield response.follow(href, self.parse_phones)
def parse_device_info(self, response):
price = response.css('td[data-spec=price]::text').get()()
models = response.css('td[data-spec=models]::text').get(
name = response.css('.specs-phone-name-title::text').get)
Object.objects.create(name='werw')
if models:
models = models.split(', ')
else:
models = ['{}'.format(name)]
launch = response.css('td[data-spec=year]::text').get()
if launch:
launch = launch.split(', ')
obj = Object.objects.create(name=name, launch=datetime.date(launch[0], datetime.datetime.strptime(launch[1][:3]).month, 1))
else:
obj = Object.objects.create(name=name)
for m in models:
item = Item.objects.create(obj=obj, name=m)
if price:
if 'euro' in p.lower():
Attr.objects.create(item=item, price=Money(float(p.split(' ')[1]), 'EUR'))
else:
price = price.split(' / ')
price = list(map(lambda x: x.replace('\u2009', ''),price))
for p in prices:
price_currency = p[0]
price_amount = float(p.split('{}'.format(p[0]))[1])
bank_symbols_currency = ''
if price_currency == '$':
bank_symbols_currency = 'USD'
elif price_currency == '€':
bank_symbols_currency = 'EUR'
elif price_currency == '£':
bank_symbols_currency = 'GBP'
elif price_currency == '₹':
bank_symbols_currency = 'INR'
if bank_symbols_currency:
Attr.objects.create(item=item, price=Money(price_amount, bank_symbols_currency))
I tried with User-agent but it doesn't change the situation.
print("Welcome to my calorie counter program")def disp_menu():choice_list = ["a", "d", "m", "q"]
initial loop
while True:
print("What do you want to do?")
print("a = add an item")
print("d = delete an item")
print("q = quit")
choice = input("make a selection>")
if choice in choice_list:
return choice
else:
print("not a valid selection, try again")
while True:
choice = disp_menu()
if choice == "a":
tot_cals, item_cnt = add_process(tot_cals, item_cnt)
elif choice == "d":
del_item()
elif choice == "q":
break
disp_meal()
#create lists
item_list = [] #list of items ordered
cals_list = [] #
#create variables
cont = str("y")
item_cnt = int(0)
tot_cals = int(0)
#define functions here
#calculates calories per grams
def calc_cals(g_type, grams):
if g_type =="f":
return grams * 9
else:
return grams * 4
#first loop
while cont.lower() =="y":
valid_data = False #this is the boolean flag for data validation
#Capture input
while not valid_data:
item_name = input("please enter the item> ")
if len(item_name) > 20:
print("not a valid food name")
elif len(item_name) == 0:
print("you need to enter a name")
else:
valid_data = True
#reset the flag for the next data item
valid_data = False
while not valid_data:
try:g_carbs = int(input("enter grams of carbs> "))
except Exception is detail:
print("carbs error: ", detail)
else:
valid_data = True
#reset the flag for the next data item
valid_data = False
while not valid_data:
try:g_fats = int(input("enter grams of fats> "))
except Exception is detail:
print("fats error: ", detail)
else:
valid_data = True
valid_data = False
while not valid_data:
try:g_protein = int(input("enter grams of protein> "))
except Exception is detail:
print("protein error: ", detail)
else:
valid_data = True
new function
def add_process(tot_cals, item_cnt):
item_name = input_name()
g_carbs = input_grams("carbs")
g_fats = input_grams("fats")`
g_prot = input_grams("protein")
#Do the math
cals = calc_cals("c", g_carbs) + calc_cals("f", g_fats) + calc_cals("p", g_prot)
#output
print("total calories for {} are {}".format(item_name, cals))
incl = input("do you want to include {}? (y/n)>".format(item_name))
if incl.lower() == "y":
add_item(item_name, cals)
#accumulate totals
tot_cals = tot_cals + cals
item_cnt += 1 #shortcut method
print("item {} entered.".format(item_name))
else:
print("Item {} not entered.".format(item_name))
return tot_cals, item_cnt
#new function
def del_item():
if len(item_list == 0):
print("you have no items to delete")
else:
print("\nDelete an item")
disp_meal()
valid_data = False
while not valid_data:
try:
choice = int(input("Enter the item number you want to delete>"))
if 1<= choice <= len(item_list):
choice = choice - 1
print("Item {}. {} with {} calories will be deleted".format(choice + 1,
item_list[choice],
cals_list[choice]))
del item_list[choice]
del cals_list[choice]
valid_data = True
except Exception as detail:
print("error: ",detail)
print("try again")
#new function
def disp_meal():
print("\nMeal Calorie Counter")
print("Num\tItem\t\tcals")
print("---\t----\t\----")
meal_cals = 0
for c in range(len(item_list)):
meal_cals += cals_list[c]
print("{}.\t{}\t\t{}".format(c+1,item_list[c],
print("\nYour meal has {} items for a total of {} calories\n".format(len(item_list), meal_cals)
print("-" * 20)
#this last line gives me an eof error
I'll start by a simplified explanation of what my code (function) has to do:
I get a file with words in it that I have to code to morse code, so I have another file where each character has a morse code. My problem occurs when I want to put each word in a dictionary as a key with as value that morse code.
to get the pattern of the code and assign it to that word in my dictionary I use another function but it says 'UnboundLocalError: local variable 'patroon' referenced before assignment'
while 'patroon' is actually my other function:
def patroon(woord, morse_dict, complement = False, spiegel = False):
patroon_str = ''
for letter in woord:
patroon_str += morse_dict[letter.upper()]
i=0
if complement==True:
patroon_list = list(patroon_str)
for char in patroon_str:
if char == '.':
patroon_list[i]='-'
elif char == '-': patroon_list[i]='.'
i+=1
patroon_str = ''.join(patroon_list)
if spiegel == True:
patroon_str = patroon_str[::-1]
return patroon_str
and here is the function in which it is called:
def groepen(woordenloc, morseloc):
morse_dict = morsecodes(morseloc)
woordpatroon_dict = {}
woorden = open(woordenloc, 'r')
for woord in woorden:
***woordpatroon_dict[woord] = patroon(woord, morse_dict)***
patroonwoorden_dict = {}
for woord, patroon in woordpatroon_dict.items():
if patroon in patroonwoorden_dict:
patroonwoorden_dict[patroon].add(woord)
else:
patroonwoorden_dict[patroon] = {woord}
return patroonwoorden_dict
where the stars are is where the error occurs
I'm new to python so I don't really know if this would be enough information.
this is my full code:
def morsecodes(locatie):
morse_file = open(locatie, 'r')
morse_dict = {}
for line_str in morse_file:
new_l = line_str.split()
if new_l[0].isalpha:
morse_dict[new_l[0].upper()] = new_l[1]
else: morse_dict[new_l[0]] = new_l[1]
return morse_dict
def patroon(woord, morse_dict, complement = False, spiegel = False):
patroon_str = ''
for letter in woord:
patroon_str += morse_dict[letter.upper()]
i=0
if complement==True:
patroon_list = list(patroon_str)
for char in patroon_str:
if char == '.':
patroon_list[i]='-'
elif char == '-': patroon_list[i]='.'
i+=1
patroon_str = ''.join(patroon_list)
if spiegel == True:
patroon_str = patroon_str[::-1]
return patroon_str
def isomorse(woord1, woord2, morse_dict, complement = False, spiegel = False):
patroon1 = patroon(woord1, morse_dict)
patroon2 = patroon(woord2, morse_dict, complement, spiegel)
if patroon1 == patroon2: return True
else: return False
def groepen(woordenloc, morseloc):
morse_dict = morsecodes(morseloc)
woordpatroon_dict = {}
woorden = open(woordenloc, 'r')
for woord in woorden:
woordpatroon_dict[woord] = patroon(woord, morse_dict)
patroonwoorden_dict = {}
for woord, patroon in woordpatroon_dict.items():
if patroon in patroonwoorden_dict:
patroonwoorden_dict[patroon].add(woord)
else:
patroonwoorden_dict[patroon] = {woord}
return patroonwoorden_dict
I have found my mistake, apparently python isn't (yet) smart enough to keep the name patroon as function and the 'patroon' as variable in my for loop separately (which I later use in the same function 'groepen')