I need to take the first line of the CSV file, process it, delete it, and then move on to the next line.
I'm trying to build a login system that takes accounts from the CSV file and logs in with each one in turn.
The problem is that every iteration of the loop picks up the same account. How can I fix it?
import pandas as pd
import pyperclip
import selenium
import random
from selenium import webdriver
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.keys import Keys
import names
df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv')
driver = uc.Chrome()  # assumption: the driver is presumably created like this (its setup was not shown in the original)

def instagram_login():
    df2 = df.at[0, 'ID']  # Find the first row id
    pyperclip.copy(df2)  # Copy the first row id to the clipboard
    print(pyperclip.paste())  # Print the first row id
    # open the site
    driver.get('https://www.instagram.com/')
    driver.maximize_window()  # full screen
    time.sleep(2)
    try:
        # click the consent button
        driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div/div[2]/div/div/div[1]/div/div[2]/div/div/div/div/div[2]/div/button[2]").click()
    except Exception:
        pass
    time.sleep(5)
    driver.find_element(By.NAME, "username").send_keys(pyperclip.paste())  # enter the username
    df2 = df.at[0, 'PASSWORD']  # find the password
    pyperclip.copy(df2)  # copy the password
    driver.find_element(By.NAME, "password").send_keys(pyperclip.paste())  # enter the password
    time.sleep(2)
    driver.find_element(By.XPATH, "//div[contains(text(),'Accedi')]").click()  # click login ('Accedi')
    time.sleep(6)
    # here is where the first row gets deleted and saved back to the csv
    df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv').drop(0, axis=0)
    df.to_csv(r'/Users/giuseppeleonardi/Downloads/scraping2.csv', index=False)

# this is the loop that always takes the same line of the file every time,
# even though that line is deleted at the end of the operation:
for _ in range(len(df)):
    instagram_login()
    time.sleep(5)
    driver.delete_all_cookies()
I've googled a lot but cannot figure it out. I've read that a file handle reads the file only once; I need the loop to reload the list every time and take the first value. How can I do that?
Sorry, I'm still learning.
Google local versus global variables: you are reassigning df inside a function, and that does not change the 'global' df. You either need to return df from the function or declare it as a global variable first.
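For example, a minimal sketch of the scoping rule (not code from the question):

x = 1
def f():
    x = 2  # creates a new local x; the global x is untouched
f()
print(x)  # still prints 1

The same thing happens to your df inside instagram_login().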
First option:
df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv')

def instagram_login():
    df2 = df.at[0, 'ID']  # Find the first row id
    .....
    # here is where the first row gets deleted and saved to the csv
    df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv').drop(0, axis=0)
    df.to_csv(r'/Users/giuseppeleonardi/Downloads/scraping2.csv', index=False)
    return df

for _ in range(len(df)):
    df = instagram_login()
    time.sleep(5)
    driver.delete_all_cookies()
Second option:
df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv')

def instagram_login():
    global df  # must be declared before df is used anywhere in the function
    df2 = df.at[0, 'ID']  # Find the first row id
    .....
    # here is where the first row gets deleted and saved to the csv
    df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv').drop(0, axis=0)
    df.to_csv(r'/Users/giuseppeleonardi/Downloads/scraping2.csv', index=False)

for _ in range(len(df)):
    instagram_login()
    time.sleep(5)
    driver.delete_all_cookies()
Assigning to df inside the function doesn't change the outer df, so you can return df from the function and save it back to the outer variable:
data_frame = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv')

def instagram_login(df):
    ......
    # here is where the first row gets deleted and saved to the csv
    df = pd.read_csv('/Users/giuseppeleonardi/Downloads/scraping2.csv').drop(0, axis=0)
    df.to_csv(r'/Users/giuseppeleonardi/Downloads/scraping2.csv', index=False)
    return df

# the loop now reassigns the shrinking frame each pass, instead of
# always re-reading the same first line:
for _ in range(len(data_frame)):
    data_frame = instagram_login(data_frame)
    time.sleep(5)
    driver.delete_all_cookies()
Related
I have a few hyperlinks (6 of them) in the list ls. I want to iterate over all the hyperlinks, and for each hyperlink iterate over the XPaths (c, a, b, d below), with the whole loop running indefinitely. Below is my code.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(executable_path=".... ")
driver.get("hyperlink 1")
driver.maximize_window()
time.sleep(30)
df = pd.read_csv('..')
df.head()
ls = df['column_name'].to_list()
for i in ls:
    print(i)
    driver.get(i)
    i = 0
    while i < len(ls):
        i += 1
        c = driver.find_element(By.XPATH, '/html/body/div[7]/div/div[11]/div[1]/div[2]/div[2]/div/div/div[1]/div/div/div/table/tbody/tr[1]/td/div/h3')
        c.click()
        time.sleep(30)
        a = driver.find_element(By.XPATH, '/html/body/div[7]/div/div[11]/div[1]/div[2]/div[2]/div/div/div[1]/div/div/div/table/tbody/tr[2]/td/div/div')
        a.click()
        time.sleep(30)
        b = driver.find_element(By.XPATH, '/html/body/div[7]/div/div[7]/div[1]/div/div/div')
        b.click()
        time.sleep(30)
        d = driver.find_element(By.XPATH, '/html/body/div[7]/div/div[11]/div[1]/div[2]/div[2]/div/div/div[1]/div/div/div/div/div/div/div/div[2]/div ')
        d.click()
        time.sleep(30)
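For what it's worth, a minimal sketch of the structure the question describes, with an outer infinite loop and the four XPaths collected in a list (xpaths here is a hypothetical name; fill it with the c, a, b, d locators above):

xpaths = [
    # the four locators from the code above (c, a, b, d)
]

while True:  # keep going indefinitely
    for link in ls:  # iterate over all the hyperlinks
        driver.get(link)
        for xp in xpaths:  # click each XPath on the page in turn
            driver.find_element(By.XPATH, xp).click()
            time.sleep(30)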
I have this function where the price of a stock gets logged in real time every 2 seconds and saved into a CSV file. However, I can't see anything in the CSV when I open it. What am I missing from the script?
import pandas as pd
import time
import urllib
import sys
import fix_yahoo_finance as yf

def stocks():
    # Enter stock symbol
    stock = input("Enter stock: ")
    # Name CSV file
    csvy = str(stock) + time.strftime('.%A.%d.%b.%Y').replace(' ', '') + ".csv"
    csvy = csvy.replace(':', '')
    with open(csvy, 'w') as f:
        sys.stdout = f
        while 1 > 0:
            print(yf.get_live_price(stock))
            time.sleep(2)

stocks()
You wrote:
print(yf.get_live_price(stock))
You want to additionally flush the buffer so your new text is immediately visible:
print(yf.get_live_price(stock), flush=True)
Alternatively, consider assigning the live price to a temporary variable and then outputting it twice, with print() and f.write(), rather than reassigning stdout. Then you'd be able to flush each stream independently as needed, with f.flush() or sys.stdout.flush().
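A minimal sketch of that alternative, reusing the names from the question:

with open(csvy, 'w') as f:   # no sys.stdout reassignment needed
    while True:
        price = yf.get_live_price(stock)  # hold the value in a temp variable
        print(price)                      # goes to the real stdout
        f.write(str(price) + "\n")        # write the same value to the csv
        f.flush()                         # flush so the row shows up immediately
        time.sleep(2)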
Referring to my locustfile.py below:
from locust import HttpLocust, TaskSet, between, task
import csv

class UserBehavior(TaskSet):
    @task(1)
    def index(l):
        with open('topURL.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                l.client.get("%s" % (row[0]))

class WebsiteUser(HttpLocust):
    task_set = UserBehavior
    wait_time = between(5.0, 9.0)
When I execute this script, Locust runs without any error. However, it loops through the rows and load tests only the latest URL; as it reads the next URL, the previous one is no longer being load tested. What I want instead is for Locust to load test more and more URLs concurrently as it reads row by row from the CSV.
Edit
I managed to achieve partial concurrency by setting wait_time = between(0.0, 0.0)
Try filling a list with your CSV data at setup and choosing randomly from it, like:
import random  # needed for random.choice below

urls = []

def fill_array():
    with open('topURL.csv') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        for row in readCSV:
            urls.append(row[0])
then
@task(1)
def index(l):
    l.client.get("%s" % (random.choice(urls)))
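One way to wire this up (a sketch, assuming the pre-1.0 Locust API used in the question, where on_start runs once per spawned user):

class UserBehavior(TaskSet):
    def on_start(self):
        fill_array()  # populate urls once per simulated user, before tasks run

    @task(1)
    def index(self):
        self.client.get("%s" % (random.choice(urls)))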
More info on setups is available at:
https://docs.locust.io/en/stable/writing-a-locustfile.html#setups-teardowns-on-start-and-on-stop
You could try something like:
global USER_CREDENTIALS
USER_CREDENTIALS = list(readCSV)
Once that's done, you will be able to refer to a separate line for each virtual user/iteration, for example:
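A sketch of that pattern, reusing the question's locustfile and filenames; the pop-in-on_start idea follows the 'How to Run Locust with Different Users' reference below:

import csv
from locust import HttpLocust, TaskSet, task, between

with open('topURL.csv') as csvfile:
    USER_CREDENTIALS = list(csv.reader(csvfile, delimiter=','))

class UserBehavior(TaskSet):
    def on_start(self):
        # each simulated user takes its own row from the shared list
        if len(USER_CREDENTIALS) > 0:
            self.row = USER_CREDENTIALS.pop()

    @task(1)
    def index(self):
        self.client.get("%s" % (self.row[0]))

class WebsiteUser(HttpLocust):
    task_set = UserBehavior
    wait_time = between(5.0, 9.0)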
References:
Python import csv to list
How to Run Locust with Different Users
I am trying to make this web app work but I am getting an error. These are the steps the web app is supposed to handle:
import a file
run the python script
export the results
When I run the Python script independently (without Flask; I use a Jupyter notebook), it works fine. On the other hand, when I run it with Flask (from the prompt) I get this error:
File "app.py", line 88, in <module>
for name, df in transformed_dict.items():
NameError: name 'transformed_dict' is not defined
Any idea how I can make this web app work?
This is my first time using Flask and I would appreciate any suggestions or guidance.
Python file & HTML file:
from flask import Flask, render_template, request, send_file
from flask_sqlalchemy import SQLAlchemy
import os
import pandas as pd
from openpyxl import load_workbook
import sqlalchemy as db

def transform(df):
    # Some data processing here
    return df

app = Flask(__name__)

@app.route('/')
def index():
    return render_template('firstpage.html')

@app.route('/upload', methods=['GET', 'POST'])
def upload():
    file = request.files['inputfile']
    xls = pd.ExcelFile(file)
    name_dict = {}
    snames = xls.sheet_names
    for sn in snames:
        name_dict[sn] = xls.parse(sn)
    for key, value in name_dict.items():
        transform(value)
    transformed_dict = {}
    for key, value in name_dict.items():
        transformed_dict[key] = transform(value)

#### write to excel example:
writer = pd.ExcelWriter("MyData.xlsx", engine='xlsxwriter')
for name, df in transformed_dict.items():
    df.to_excel(writer, sheet_name=name)
writer.save()

if __name__ == '__main__':
    app.run(port=5000)
Your block:
#### write to excel example:
writer = pd.ExcelWriter("MyData.xlsx", engine='xlsxwriter')
for name, df in transformed_dict.items():
    df.to_excel(writer, sheet_name=name)
writer.save()
should be part of your upload() function, since that's where you define and fill transformed_dict. You just need to match its indentation to the block above it, as sketched below.
The current error comes up because Python tries to run that code as soon as you start the script, and transformed_dict doesn't exist at that point.
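A sketch of how the end of upload() would look with the block moved inside (the send_file return is an assumption, not from the original, just so the route responds with something):

@app.route('/upload', methods=['GET', 'POST'])
def upload():
    # ... same body as in the question, up to building transformed_dict ...
    transformed_dict = {}
    for key, value in name_dict.items():
        transformed_dict[key] = transform(value)

    #### write to excel example, now indented inside upload():
    writer = pd.ExcelWriter("MyData.xlsx", engine='xlsxwriter')
    for name, df in transformed_dict.items():
        df.to_excel(writer, sheet_name=name)
    writer.save()
    return send_file("MyData.xlsx")  # assumption: one way to send the result back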
I'm trying to scrape the full HTML table from this site:
https://www.iscc-system.org/certificates/all-certificates/
My code is as follows:
from selenium import webdriver
import time
import pandas as pd

url = 'https://www.iscc-system.org/certificates/all-certificates/'
browser = webdriver.Chrome('/home/giuseppe/bin/chromedriver')
browser.get(url)
csvfile = open('Scrape_certificates', 'a')

dfs = pd.read_html('https://www.iscc-system.org/certificates/all-certificates/', header=0)
for i in range(1, 10):
    for df in dfs:
        df.to_csv(csvfile, header=False)
    link_next_page = browser.find_element_by_id('table_1_next')
    link_next_page.click()
    time.sleep(4)
    dfs = pd.read_html(browser.current_url)
csvfile.close()
The above code covers only the first 10 pages of the full table, as an example.
The problem is that the output is always the same first table, repeated 10 times. Clicking the 'next table' button does update the table on the page (at least visually), but I can't get the new data from the following pages; I always get the same data from the first table.
Firstly, you are reading the URL with pandas, not the page source; this fetches the page anew instead of reading the Selenium-generated source. Secondly, you want to limit the reading to the table with id table_1. Try this:
from selenium import webdriver
import time
import pandas as pd

url = 'https://www.iscc-system.org/certificates/all-certificates/'
browser = webdriver.Chrome('/home/giuseppe/bin/chromedriver')
browser.get(url)
csvfile = open('Scrape_certificates', 'a')

for i in range(1, 10):
    dfs = pd.read_html(browser.page_source, attrs={'id': 'table_1'})
    for df in dfs:
        df.to_csv(csvfile, header=False)
    link_next_page = browser.find_element_by_id('table_1_next')
    link_next_page.click()
    time.sleep(4)
csvfile.close()
You will need to remove or filter out row 10 from each result, as it is the navigation row.
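For example, a one-line filter (assuming the navigation row is the last row of each scraped frame, which is worth checking):

for df in dfs:
    df.iloc[:-1].to_csv(csvfile, header=False)  # drop the trailing navigation row before writing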