Facing issues with my twitter scraper written using python and selenium - python-3.x

I've written a script in python to parse the name, tweets, following, and followers of the profiles listed in the "view all" section of my Twitter profile page. It is currently doing its job. However, I find two problems with this scraper:
Every page it parses the documents from stays open, jamming up the taskbar.
The scraper has got a clumsy look.
Here is what I've written:
from selenium import webdriver
import time


def twitter_data():
    """Log in to Twitter and visit every profile linked from the "view all" section.

    NOTE(review): the '@' signs in the XPath attribute tests were garbled to
    '#' in the original paste ('[#id=...]' is not valid XPath); they are
    restored here.
    """
    driver = webdriver.Chrome()
    driver.get('https://twitter.com/?lang=en')
    driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
    driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    driver.implicitly_wait(15)
    # Clicking the "view all" link.
    driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
    time.sleep(10)
    # Going on to each profile falling under the "view all" section.
    for links in driver.find_elements_by_xpath("//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]"):
        processing_files(links.get_attribute("href"))


def processing_files(item_link):
    """Open *item_link* in a fresh browser and print name/tweets/following/followers.

    NOTE(review): a brand-new Chrome instance per profile is what causes the
    taskbar clutter the question complains about; it is never closed.
    """
    driver = webdriver.Chrome()
    driver.get(item_link)
    # Getting information about each profile holder.
    for prof in driver.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)


twitter_data()
I've used both implicitly_wait and time.sleep in my scraper because, when I found it was necessary to make the bot wait a bit longer, I used the latter. Thanks in advance for taking a look at it.

You can use driver.quit() to close the pages as given below. This will reduce pages in the task bar.
from selenium import webdriver
import time


def twitter_data():
    """Log in to Twitter, visit each "view all" profile, then quit the browser.

    NOTE(review): the '@' signs in the XPath attribute tests were garbled to
    '#' in the original paste ('[#id=...]' is not valid XPath); they are
    restored here.
    """
    driver = webdriver.Chrome()
    driver.get('https://twitter.com/?lang=en')
    driver.find_element_by_xpath('//input[@id="signin-email"]').send_keys('username')
    driver.find_element_by_xpath('//input[@id="signin-password"]').send_keys('password')
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    driver.implicitly_wait(15)
    # Clicking the "view all" link.
    driver.find_element_by_xpath("//small[@class='view-all']//a[contains(@class,'js-view-all-link')]").click()
    time.sleep(10)
    # Going on to each profile falling under the "view all" section.
    for links in driver.find_elements_by_xpath("//div[@class='stream-item-header']//a[contains(@class,'js-user-profile-link')]"):
        processing_files(links.get_attribute("href"))
    # Close the main window once every profile has been processed.
    driver.quit()


def processing_files(item_link):
    """Open *item_link* in its own browser, print the profile stats, and quit."""
    driver1 = webdriver.Chrome()
    driver1.get(item_link)
    # Getting information about each profile holder.
    for prof in driver1.find_elements_by_xpath("//div[@class='route-profile']"):
        name = prof.find_elements_by_xpath(".//h1[@class='ProfileHeaderCard-name']//a[contains(@class,'ProfileHeaderCard-nameLink')]")[0]
        tweet = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[0]
        following = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[1]
        follower = prof.find_elements_by_xpath(".//span[@class='ProfileNav-value']")[2]
        print(name.text, tweet.text, following.text, follower.text)
    # Close this profile's window to keep the taskbar clean.
    driver1.quit()


twitter_data()

Related

I'm writing a code where I can enter Instagram and get the follower list

Hello, I'm a beginner coder.
I want to log in to Instagram using Selenium and get my follower list, but the code I wrote does not give any error or any output.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


class İnstagram:
    """Log in to Instagram with Selenium and print the follower list.

    NOTE(review): the '@' signs in the XPath attribute tests were garbled to
    '#' in the original paste; they are restored here.
    """

    def __init__(self, username, password):
        # BUG FIX: the original parameter was named ``name`` while the body
        # assigned from the undefined name ``username`` (NameError).  The
        # parameter is renamed to match; positional callers are unaffected.
        self.browser = webdriver.Firefox()
        self.username = username
        self.password = password

    def signIn(self):
        """Open instagram.com and submit the login form with the stored credentials."""
        self.browser.get("https://www.instagram.com")
        time.sleep(1)
        # send_keys() returns None, so the original ``name =``/``psw =``
        # assignments were dropped as useless.
        self.browser.find_element(By.XPATH, "//*[@id='loginForm']/div[1]/div[1]/div/label/input").send_keys(self.username)
        self.browser.find_element(By.XPATH, "//*[@id='loginForm']/div[1]/div[2]/div/label/input").send_keys(self.password)
        self.browser.find_element(By.XPATH, "//*[@id='loginForm']/div[1]/div[3]").click()
        time.sleep(7)

    def getFollowers(self):
        """Open the followers page and print each follower's profile URL."""
        self.browser.get("https://www.instagram.com/emirhaninmalikanesi/followers/")
        followers = self.browser.find_elements(By.CSS_SELECTOR, "._ab8w._ab94._ab99._ab9h._ab9m._ab9o._abcm")
        for user in followers:
            name = user.find_element(By.CSS_SELECTOR, ".x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.notranslate._a6hd").get_attribute("href")
            print(name)


# BUG FIX: ``username`` and ``password`` were undefined at module level in the
# original, so the script crashed before opening the browser.  Replace these
# placeholders with real credentials.
username, password = "your_username", "your_password"
instagram = İnstagram(username, password)
instagram.signIn()
instagram.getFollowers()
Your code has two flaws
Before login you have to close the cookies dialog, otherwise the code raises ElementClickInterceptedException when trying to click on "Log in".
No use of selenium built-in timeout methods (driver.implicitly_wait or WebDriverWait) to wait for an element to be found. This causes followers to be an empty list.
Corrected code
# NOTE(review): fragment from the answer — assumes ``driver``, ``username``
# and ``password`` are already defined by the surrounding code.  The '@'
# signs in the XPath attribute tests were garbled to '#' in the paste
# ('[#id=...]' is not valid XPath) and are restored here.
driver.implicitly_wait(9)  # driver waits up to 9 seconds for an element to be found
# Close the cookie dialog first, otherwise clicking "Log in" raises
# ElementClickInterceptedException.
driver.find_element(By.XPATH, "//div[@role='dialog']/div/button[contains(.,'cookie')]").click()
driver.find_element(By.XPATH, "//*[@id='loginForm']/div[1]/div[1]/div/label/input").send_keys(username)
driver.find_element(By.XPATH, "//*[@id='loginForm']/div[1]/div[2]/div/label/input").send_keys(password)
driver.find_element(By.XPATH, "//*[@id='loginForm']/div[1]/div[3]").click()
driver.get("https://www.instagram.com/emirhaninmalikanesi/followers/")
followers = driver.find_elements(By.CSS_SELECTOR, "._ab8w._ab94._ab99._ab9h._ab9m._ab9o._abcm")
for user in followers:
    name = user.find_element(By.CSS_SELECTOR, ".x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.notranslate._a6hd").get_attribute("href")
    print(name)
As a final consideration, notice that you can avoid the login process by loading in selenium a user profile where you are already logged in. For more info look here https://stackoverflow.com/a/75434260/8157304

Find the Twitter text box element with python selenium

I made my own Twitter complaint bot that tweets at my ISP if the network drops.
The code works perfectly until it has to find the Twitter textbox to type the tweet.
Main error is:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
I have tried:
Adding time delays
Using Firefox Driver instead of Google
Adding page refreshes before the tweet_at_provider() looks for the textbox
Clicking the "Tweet" button to bring up the textbox to then try type in it
Using find.element_by_id but twitter changes id every pageload
When I comment out the first function call to test, it will find and type 6/10 times.
But when both functions are called the tweet_at_provider() always fails at grabbing the textbox and I get the StaleElement error.
import selenium, time, pyautogui
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, StaleElementReferenceException

PROMISED_DOWN = 200
PROMISED_UP = 10
# Raw strings keep the Windows backslashes literal (the original
# "C:\Development\..." relied on unrecognised escapes being passed through,
# which is deprecated).
CHROME_DRIVER_PATH = r"C:\Development\chromedriver.exe"
GECKODRIVER_PATH = r"C:\Users\meeha\Desktop\geckodriver\geckodriver.exe"
TWITTER_USERNAME = "my_username"
TWITTER_PASSWORD = "my_password"


class InternetSpeedTwitterBot:
    """Measure internet speed on speedtest.net and tweet the result at the ISP.

    NOTE(review): the '@' signs in the XPath attribute tests were garbled to
    '#' in the original paste ('[#id=...]' is not valid XPath); restored here.
    """

    def __init__(self, driver_path):
        self.driver = webdriver.Chrome(executable_path=driver_path)
        self.down = 0
        self.up = 0

    def get_internet_speed(self):
        """Run a speedtest.net measurement and store the down/up result elements."""
        self.driver.get("https://www.speedtest.net/")
        self.driver.maximize_window()
        time.sleep(2)
        go = self.driver.find_element_by_xpath("//*[@id='container']/div/div[3]/div/div/div/div[2]/div[3]/div[1]/a/span[4]")
        go.click()
        time.sleep(40)  # wait for the speed test to complete
        self.down = self.driver.find_element_by_xpath("//*[@id='container']/div/div[3]/div/div/div/div[2]/div[3]/div[3]/div/div[3]/div/div/div[2]/div[1]/div[2]/div/div[2]/span")
        self.up = self.driver.find_element_by_xpath("//*[@id='container']/div/div[3]/div/div/div/div[2]/div[3]/div[3]/div/div[3]/div/div/div[2]/div[1]/div[3]/div/div[2]/span")
        print(f"Download Speed: {self.down.text} Mbps")
        print(f"Upload Speed: {self.up.text} Mbps")
        time.sleep(3)

    def tweet_at_provider(self):
        """Log in to Twitter and tweet the measured speeds at the provider."""
        self.driver.get("https://twitter.com/login")
        self.driver.maximize_window()
        time.sleep(3)
        username = self.driver.find_element_by_name("session[username_or_email]")
        password = self.driver.find_element_by_name("session[password]")
        username.send_keys(TWITTER_USERNAME)
        password.send_keys(TWITTER_PASSWORD)
        password.submit()
        time.sleep(5)
        tweet_compose = self.driver.find_element_by_xpath('//*[@id="react-root"]/div/div/div[2]/header/div/div/div/div[1]/div[3]/a/div/span/div/div/span/span')
        tweet_compose.click()
        time.sleep(2)
        textbox = self.driver.find_element_by_xpath('//*[@id="layers"]/div[2]/div/div/div/div/div/div[2]/div[2]/div/div[3]/div/div/div/div[1]/div/div/div/div/div[2]/div[1]/div/div/div/div/div/div/div/div/div/div[1]/div/div/div/div[2]/div/div/div/div')
        # The question says the bot "tweets at my ISP", which needs an
        # @-mention; the '@' was garbled to '#' in the paste and is restored.
        textbox.send_keys(f"Hey @Ask_Spectrum, why is my internet speed {self.down.text} down / {self.up.text} up when I pay for {PROMISED_DOWN} down / {PROMISED_UP} up???")


bot = InternetSpeedTwitterBot(CHROME_DRIVER_PATH)
bot.get_internet_speed()
bot.tweet_at_provider()
I had the same error there and figured out that the HTML tag was instantly changing as soon as I typed something into the Twitter text box.
I tackled this problem using the XPATH of the span tag that showed up after I typed a space. A br tag is the initial tag when you have not entered any text; only after you type something does it turn into a span tag, and that's when you have to copy the XPATH and use it for your application.

Web Scraping reviews -Flipkart

I am trying to extract the entire review of a product (the remaining half of the review is displayed only after clicking "read more"), but I am still not able to do so. It is not displaying the entire content of a review, which gets displayed after clicking the read more option. Below is the code, which clicks the read more option and also gets data from the website.
from bs4 import BeautifulSoup
from selenium import webdriver

chromepath = r"C:\Users\Mohammed\Downloads\chromedriver.exe"
driver = webdriver.Chrome(chromepath)
driver.get("https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page=2&pid=MOBF85V7A6PXETAX")

# Expand the truncated review by clicking its READ MORE element.
d = driver.find_element_by_class_name("_1EPkIx")
d.click()

# BUG FIX: the original parsed a *separate* requests.get() response, so the
# Selenium click above had no effect on the printed text (the static HTML
# still contained the truncated reviews).  Parse the live page source of the
# browser instead.
data = BeautifulSoup(driver.page_source, 'lxml')

title = data.find_all("p", {"class": "_2xg6Ul"})
text1 = data.find_all("div", {"class": "qwjRop"})
name = data.find_all("p", {"class": "_3LYOAd _3sxSiS"})
for t2, t, t1 in zip(title, text1, name):
    print(t2.text, '\n', t.text, '\n', t1.text)
To get the full reviews, It is necessary to click on those READ MORE buttons to unwrap the rest. As you have already used selenium in combination with BeautifulSoup, I've modified the script to follow the logic. The script will first click on those READ MORE buttons. Once it is done, it will then parse all the titles and reviews from there. You can now get the titles and reviews from multiple pages (upto 4 pages).
import time
from bs4 import BeautifulSoup
from selenium import webdriver

# Page number is injected into the template below (pages 1-4).
link = "https://www.flipkart.com/poco-f1-graphite-black-64-gb/product-reviews/itmf8fyjyssnt25c?page={}&pid=MOBF85V7A6PXETAX"

driver = webdriver.Chrome()  # If necessary, define the chrome path explicitly
for page_num in range(1, 5):
    driver.get(link.format(page_num))
    # Unwrap every truncated review by clicking its READ MORE button.
    for btn in driver.find_elements_by_class_name("_1EPkIx"):
        btn.click()
    time.sleep(1)
    # Hand the fully expanded page over to BeautifulSoup for parsing.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for items in soup.select("._3DCdKt"):
        title = items.select_one("p._2xg6Ul").text
        review = ' '.join(items.select_one(".qwjRop div:nth-of-type(2)").text.split())
        print(f'{title}\n{review}\n')
driver.quit()

How to automate the crawling without hardcoding any number to it?

I've written a script using python with selenium to scrape names of restaurants from a webpage. It is working great if I hardcode the number of amount I want to parse. The page has got lazy-loading process and it displays 40 names in each scroll. However, my script can handle it. The only thing I would like to improve in my script is that I do not wish to hardcode the number; rather, I want it to detect itself how many are there and parse it successfully. Hope there is someone to help. Here is the code:
from selenium import webdriver
import time

# NOTE(review): the '@' signs in the XPath attribute tests were garbled to
# '#' in the original paste; restored here.  The hardcoded 240 is the very
# thing the question asks how to remove, so it is deliberately kept.
driver = webdriver.Chrome()
driver.get('https://www.yellowpages.ca/search/si/1/pizza/Toronto')
while True:
    # Trigger the page's lazy loading (40 more names per scroll).
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    links = [posts.text for posts in driver.find_elements_by_xpath("//div[@itemprop='itemListElement']//h3[@itemprop='name']/a")]
    if len(links) == 240:
        break
for link in links:
    print(link)
driver.quit()
You can check if the number of links has changed in the last iteration
# Scroll until the number of collected links stops growing, i.e. the page has
# no more lazily-loaded entries.  Assumes ``driver`` exists in the enclosing
# scope.  NOTE(review): the '@' signs in the XPath were garbled to '#' in the
# original paste and are restored here.
num_of_links = -1  # sentinel so the loop runs at least once
num = 0
while num != num_of_links:
    num_of_links = num
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    links = [posts.text for posts in driver.find_elements_by_xpath("//div[@itemprop='itemListElement']//h3[@itemprop='name']/a")]
    num = len(links)

Selenium web scraping in python cant read .text of elements

I am trying to scrape reviews from the Verizon website, and I found the XPath of the reviews by using inspect on the webpage. I am executing the code below, but review.text does not seem to work reliably all the time. I get the correct text sometimes, and sometimes it just prints the error message.
Not sure what I am doing wrong.
from selenium import webdriver

# NOTE(review): the '@' signs in the XPath attribute tests were garbled to
# '#' in the original paste ('[#id=...]' is not valid XPath); restored here.
url = 'https://www.verizonwireless.com/smartphones/samsung-galaxy-s7/'
browser = webdriver.Chrome(executable_path='/Users/userName/PycharmProjects/Verizon/chromedriver')
browser.get(url)

reviews = []
xp = '//*[@id="BVRRContainer"]/div/div/div/div/div[3]/div/ul/li[2]/a/span[2]'
# Read first ten pages of reviews ==>
for j in range(10):
    reviews.extend(browser.find_elements_by_xpath('//*[@id="BVRRContainer"]/div/div/div/div/ol/li[*]/div/div[1]'
                                                  '/div/div[2]/div/div/div[1]/p'))
    try:
        next_btn = browser.find_element_by_xpath(xp)
        next_btn.click()
    except Exception:  # narrowed from a bare except, which would also swallow KeyboardInterrupt
        print(j, "error clicking")

# Print reviews ===>
for i, review in enumerate(reviews):
    try:
        print(review.text)
    except Exception:
        # BUG FIX: the original `print("Error in :" review)` was a syntax
        # error — the comma between the arguments was missing.
        print("Error in :", review)
You should improve the logic of your code. Note, that you cannot get text of elements from the first page after redirection to next page- you need to get text before clicking "Next" button.
Try to use below code instead:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time

# NOTE(review): the '@' sign in the XPath below was garbled to '#' in the
# original paste and is restored here.
url = 'https://www.verizonwireless.com/smartphones/samsung-galaxy-s7/'
browser = webdriver.Chrome()
browser.get(url)

reviews = []
xp = '//a[span[@class="bv-content-btn-pages-next"]]'
# Read first ten pages of reviews ==>
for i in range(10):
    # Collect the text BEFORE clicking "Next": after navigation the old
    # elements become stale.
    for review in browser.find_elements_by_xpath('//div[@class="bv-content-summary-body-text"]/p'):
        reviews.append(review.text)
    try:
        next_btn = browser.find_element_by_xpath(xp)  # renamed: `next` shadows the builtin
        next_btn.location_once_scrolled_into_view
        time.sleep(0.5)  # To wait until scrolled down to "Next" button
        next_btn.click()
        time.sleep(2)  # To wait for page "autoscrolling" to first review + until modal window dissapeared
    except WebDriverException:
        print("error clicking")

for review in reviews:
    print(review)

Resources