Extract multiple URLs using the datetime module - python-3.x

In this program I am not using requests or Beautiful Soup; I'm only using datetime to build the URLs. Currently the program extracts values for a long period. I want to change it so that, if I automate the program and it runs today, it extracts yesterday's data; if it runs tomorrow, it extracts today's data, and so on.
Here is the code:
import datetime
from datetime import date, datetime, timedelta
import warnings
import datetime
import pandas as pd
import wget
import glob
import os
warnings.filterwarnings("ignore")
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from urllib.error import HTTPError

def date_range(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

def get_urls(base_url):
    part_two = "/dailyCoal1-"
    end_part = ".xlsx"
    start_date = date(2020, 11, 1)
    end_date = datetime.datetime.now().date()
    start_urls = list()
    for single_date in date_range(start_date, end_date):
        start_urls.append(single_date.strftime(base_url + '%d-%m-%Y' + part_two + '%Y-%m-%d' + end_part))
    return start_urls

def excel_download(link, out):
    # downloads a given link to the output directory in out
    wget.download(link, out)

if __name__ == "__main__":
    base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
    mypath = "/Users/vp/Desktop/temp"
    temp_folder = '/Users/vp/Desktop/temp'
    out_folder = "/Users/vp/Desktop/NPP"
    log_file = os.path.join(out_folder, 'debug_log_npp.log')
    out_file = os.path.join(out_folder, 'Energy_inputs_npp.csv')
    file_links = get_urls(base_url)
    for link in file_links:
        try:
            excel_download(link, temp_folder)
        except HTTPError:
            content = "HTTP issue while capturing data for this link - " + link
            log_writer(log_file, content)
            continue
        file = glob.glob(os.path.join(temp_folder, '*.xlsx'), recursive=True)[0]
        df = pd.read_excel(file)
To capture only yesterday's data, I added a check in the main function that compares the date being processed against Yesterday and removes the file if it doesn't match. But it throws an error because the loop always starts from the fixed start date:

if date_time_obj != Yesterday:
    os.remove(file)
    content = "Date mis-matched - " + str(date_time_obj) + " " + str(Yesterday)

In this snippet, date_time_obj is the date the program is currently trying to extract data for.
If this program runs every day at 8 pm, it only needs to capture the previous day's data.
If this cannot be done with datetime alone, but only with requests or bs4, how do I approach this problem?

I don't know if you wanted a valid link, as your code doesn't seem to produce those for me, but you only need a small tweak: work off start_date alone and return a single item, so you get yesterday's link, matching your current output for the same date.
import datetime
from datetime import date, datetime, timedelta
import warnings
import datetime
import pandas as pd
import wget
import glob
import os
warnings.filterwarnings("ignore")
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from urllib.error import HTTPError

def get_url(base_url):
    part_two = "/dailyCoal1-"
    end_part = ".xlsx"
    start_date = datetime.datetime.now().date() + timedelta(-1)
    start_url = start_date.strftime(base_url + '%d-%m-%Y' + part_two + '%Y-%m-%d' + end_part)
    return start_url

def excel_download(link, out):
    # downloads a given link to the output directory in out
    wget.download(link, out)

if __name__ == "__main__":
    base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
    mypath = "/Users/vp/Desktop/temp"
    temp_folder = '/Users/vp/Desktop/temp'
    out_folder = "/Users/vp/Desktop/NPP"
    log_file = os.path.join(out_folder, 'debug_log_npp.log')
    out_file = os.path.join(out_folder, 'Energy_inputs_npp.csv')
    file_link = get_url(base_url)
    print(file_link)
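If you also want to download and read yesterday's file in the same run, a minimal sketch could look like the snippet below; it reuses excel_download and the HTTPError handling from the question, and assumes the paths and the log_writer helper from the original script already exist:

    # inside the if __name__ == "__main__": block, after file_link = get_url(base_url)
    try:
        excel_download(file_link, temp_folder)
        file = glob.glob(os.path.join(temp_folder, '*.xlsx'), recursive=True)[0]
        df = pd.read_excel(file)
        print(df.head())
    except HTTPError:
        # yesterday's report may not be published yet; log and move on
        content = "HTTP issue while capturing data for this link - " + file_link
        log_writer(log_file, content)  # log_writer is the helper from the original script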

Related

Matplotlib, Tkinter, Serial Multiprocessing

I created a program (using Tkinter, Python 3 and matplotlib) that reads data from different serial ports, saves it to a CSV file, creates graphs, and finally previews the data in a GUI. The code was split into two scripts: the main script handled reading data, saving it to CSV and previewing it, and the other script handled the graph creation.
Today I rewrote the code using the answer of @user2464430 here. The code is working, but I can't update the GUI. It opens once and then never refreshes with new data.
The following code is part of the total code.
My code is:
from PIL import ImageTk, Image
import tkinter as Tk
import multiprocessing
from queue import Empty, Full
from time import strftime
import serial
import numpy as np
import matplotlib.pyplot as plt
from drawnow import *
from pylab import *
import pandas as pd
from datetime import timedelta
from datetime import datetime
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import locale
import os

class GuiApp(object):
    def __init__(self, image):
        self.root = Tk.Tk()
        self.root.resizable(width=False, height=False)
        self.root.geometry("1600x800+0+0")
        C = Canvas(self.root, bg="black", width=1600, height=800)

        def BasicLabels():
            .......  # in this stage create multiple axis labels
            ΥAxisLabels()

        BasicLabels()

        def ValueLabels():
            .......  # Read and manipulate data from the CSV file and print it in labels

        ValueLabels()
        C.pack()

def GenerateData(q):  # Read serial ports and store data to CSV file
    file_exists = os.path.isfile("BigData.csv")
    header = [["Daytime,T1"]]
    if not file_exists:
        with open("BigData.csv", "a+") as csvfile:
            np.savetxt(csvfile, header, delimiter=",", fmt="%s", comments="")
    while True:
        try:
            ser1 = serial.Serial(port="COM4", baudrate=9600)
            read_ser1 = ser1.readline()
            if read_ser1 == "":
                read_ser1 = "Missing Value"
            else:
                read_ser1 = ser1.readline()
                read_ser1 = str(read_ser1[0:len(read_ser1)].decode("utf-8"))
            # print("COM4:", read_ser1)
            ser1.close()
        except:
            print("Failed 1")
            read_ser1 = "9999,9999,9999,9999,9999"
        daytime = strftime(" %d-%m-%Y %H:%M:%S")
        rows = [daytime + "," + read_ser1.strip()]
        with open("BigData.csv", "a+") as csvfile:
            np.savetxt(csvfile, rows, delimiter=",", fmt="%s", comments="")
        CreateGraphs()

def CreateGraphs():
    # Code to generate graph. Called every time there is a new line in the CSV.

if __name__ == "__main__":
    # Queue which will be used for storing data
    q = multiprocessing.Queue()
    q.cancel_join_thread()  # or else the process that puts data will not terminate
    gui = GuiApp(q)
    t1 = multiprocessing.Process(target=GenerateData, args=(q,))
    t1.start()
    gui.root.mainloop()
    t1.join()
The graphs are generated after the while True in GenerateData.
All data for the labels and graphs comes from the CSV file, not directly from the serial port.
Is it possible to update the GUI with the latest data from the CSV and the created graphs?
Thanks for your time.
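One common way to keep a Tkinter GUI refreshing from a file that another process keeps appending to is to poll it with root.after() instead of building the labels only once in __init__. Below is a minimal, self-contained sketch of that pattern, not the full solution for the code above; the file name BigData.csv and the Daytime/T1 columns are taken from the question, while the App class and its single label are illustrative:

import tkinter as tk
import pandas as pd

class App:
    def __init__(self, root):
        self.root = root
        self.label = tk.Label(root, text="waiting for data...")
        self.label.pack()
        self.refresh()  # start the polling loop

    def refresh(self):
        try:
            # re-read the CSV that the writer process keeps appending to
            df = pd.read_csv("BigData.csv")
            if not df.empty:
                last = df.iloc[-1]
                self.label.config(text=f"{last['Daytime']}  T1={last['T1']}")
        except Exception as exc:
            self.label.config(text=f"no data yet ({exc})")
        # schedule the next poll; after() keeps the mainloop responsive
        self.root.after(1000, self.refresh)

if __name__ == "__main__":
    root = tk.Tk()
    App(root)
    root.mainloop()

The same after() callback could also regenerate the graphs, so the GUI picks up every new CSV line without blocking the mainloop.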

Unable to save file in DBFS

I took the Azure open datasets that are available for practice, got 10 days of data from that dataset, and now I want to save this data into DBFS in CSV format. I am facing an error:
"No such file or directory: '/dbfs/temp/hive/mytest.csv'"
On the other hand, I am able to access the path directly from DBFS, so the path looks correct.
My code:
from azureml.opendatasets import NoaaIsdWeather
from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

spark.sql('DROP Table if exists mytest')
dbutils.fs.rm("dbfs:/tmp/hive", recurse=True)
basepath = "dbfs:/tmp/hive"

try:
    dbutils.fs.ls(basepath)
except:
    dbutils.fs.mkdirs(basepath)
else:
    raise Exception("The Folder " + basepath + " already exist, this notebook will remove in the end")

dbutils.fs.mkdirs("dbfs:/tmp/hive")

start_date = parser.parse('2020-5-1')
end_date = parser.parse('2020-5-10')
isd = NoaaIsdWeather(start_date, end_date)
pdf = isd.to_spark_dataframe().toPandas().to_csv("/dbfs/temp/hive/mytest.csv")
What should I do?
Thanks.
I tried reproducing the same issue. First I used the following code and made sure that the directory exists using os.listdir().
from azureml.opendatasets import NoaaIsdWeather
from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

spark.sql('DROP Table if exists mytest')
dbutils.fs.rm("dbfs:/tmp/hive", recurse=True)
basepath = "dbfs:/tmp/hive"

try:
    dbutils.fs.ls(basepath)
except:
    dbutils.fs.mkdirs(basepath)
else:
    raise Exception("The Folder " + basepath + " already exist, this notebook will remove in the end")

dbutils.fs.mkdirs("dbfs:/tmp/hive")

import os
os.listdir("/dbfs/tmp/hive/")
Then I used the following to write the CSV using to_pandas_dataframe(). This successfully wrote the required dataframe to a CSV file in the required path.
mydf = isd.to_pandas_dataframe()
mydf.to_csv("/dbfs/tmp/hive/mytest.csv")
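Note that the question creates the directory dbfs:/tmp/hive but then writes to /dbfs/temp/hive/mytest.csv, which is a different folder; that mismatch alone explains the "No such file or directory" error. If you prefer to keep the original Spark-then-pandas route, a minimal sketch with the path matched to the directory that was actually created:

# assumes isd = NoaaIsdWeather(start_date, end_date) as in the question
pdf = isd.to_spark_dataframe().toPandas()
# write to the same folder that dbutils.fs.mkdirs("dbfs:/tmp/hive") created
pdf.to_csv("/dbfs/tmp/hive/mytest.csv", index=False)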

Extracting links from a URL until there is data and then moving to the next URL

I have a URL from which I am trying to extract data, and I have found a way to extract it. But how do I move on to the next URL once the existing URL has no more data?
The base URL used in the main function is:
https://posoco.in/reports/daily-reports/
Instead, I only want to extract data starting from 2020-21 and then continue from here:
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
This way, once all the PDF files from 2020-21 are extracted, the program should start extracting from the next URL, which is 2021-22, and so on for as long as the page exists, so the program can check it automatically every year.
The code I have written:
# import libraries
import re
import tabula
import datetime
from datetime import datetime, timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget

def source_urls(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    filelink = []
    for link in tags:
        if 'daily-reports-' in link.get('href', ''):
            filelink.append(link.get('href'))
    return filelink

def get_urls(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    filelink = []
    for link in tags:
        if '_nldc_psp/?wpdmdl' in link.get('href', ''):
            filelink.append(link.get('href'))
    return filelink

if __name__ == "__main__":
    url = 'https://posoco.in/reports/daily-reports/'
    file_links = source_urls(url)
    sorted_file_links = sorted(file_links)
    for files in sorted_file_links:
        sub_files = get_urls(files)
        for x in sub_files:
            print(x)
The program output
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the libraries, but only a few are used here; the rest are used for downloading, processing, and logging.
Well, you have to set a condition so that it only picks up the main URLs that are equal to or later than 2020-21, and then you can parse the inner URLs.
Also, there's no need for a .get() default here: since you only keep URLs that include _nldc_psp/?wpdmdl, a match means the href exists, so you don't need to fall back to an empty string with link.get('href', '') before returning it.
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp

cond = datetime.strptime('2020-21', '%Y-%d')

async def get_urls(client):
    r = await client.get('https://posoco.in/reports/daily-reports/')
    soup = BeautifulSoup(r.text, 'lxml')
    return [x['href'] for x in soup.select('a[href*=reports-]')
            if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond]

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        links = await get_urls(client)

        async def get_pdf(url):
            r = await client.get(url)
            soup = BeautifulSoup(r.text, 'lxml')
            pp([x['href'] for x in soup.select('a[href*="nldc_psp"]')])

        if links:
            for link in links:
                nurse.start_soon(get_pdf, link)

if __name__ == "__main__":
    trio.run(main)
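If you would rather stay with the urllib/BeautifulSoup code from the question instead of switching to httpx/trio, the same idea can be expressed as a simple filter on the year-page links before looping over them. A sketch reusing the question's own source_urls and get_urls functions:

if __name__ == "__main__":
    url = 'https://posoco.in/reports/daily-reports/'
    file_links = source_urls(url)
    # keep only the year pages from daily-reports-2020-21 onwards;
    # the trailing "YYYY-YY" part compares correctly as a string
    wanted = [f for f in sorted(file_links)
              if f.rstrip('/').rsplit('daily-reports-', 1)[-1] >= '2020-21']
    for files in wanted:
        for x in get_urls(files):
            print(x)

Because the filter is just a string comparison on the year suffix, newly added pages such as 2021-22 and later are picked up automatically each year.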

How to get the current date with python

I have made a web scraper with Python 3 and bs4. I want the current date so that I can use it as a file name for the scraped website.
Here is my code:
import bs4
import requests
import sys
import re
import unicodedata
import os

filename = #Current date#
filename = r"C:\Python\Scripts\Webscrapers\Output\\" + filename + ".txt"
url = "https://www.wikipedia.org/Example_Article/"
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text, "lxml")
file = open(filename, 'wb')
for i in soup.select("p"):
    f = i.text
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", f)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
file.close()
After hours of googling, I came up with this:
>>> import datetime
>>> print (datetime.datetime.today())
2020-05-14 11:49:55.695210
>>>
But I want something like this: 14-May-2020
Is it possible? If so, please help me.
I just want the current date as a string.
Use the strftime function from the time module:
import time
time.strftime("%d-%B-%Y", time.localtime())
'14-May-2020'
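Since you were already experimenting with the datetime module, the same format string also works with datetime directly, which avoids the extra time import (the printed value below is just the example date from the question):

import datetime

filename = datetime.datetime.today().strftime("%d-%B-%Y")
print(filename)  # e.g. '14-May-2020'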

How can I make the PhantomJS webdriver wait until a specific HTML element is loaded and then return the page source?

I have developed the code below for a web crawling object.
It takes two dates as inputs, creates a list of dates between those two dates, and attaches each one to a webpage URL containing weather information for a location. It then converts the HTML tables of data into a DataFrame and stores the data as a CSV file (the base link is https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3, and as you can see in this example the date is 2019-1-3):
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens link and visits page and returns html source code of page
        self.driver.get(link)
        html = self.driver.page_source
        return html

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
This is the way I want to use the WebCrawler object:
date1 = date(2018, 12, 29)
date2 = date(2019, 1, 1)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('PROCESSING : ', day)
    link = crawler.create_link(day)
    print('WAITING... ')
    time.sleep(3)
    print('VISIT WEBPAGE ... ')
    html = crawler.open_link(link)
    print('DATA RETRIEVED ... ')
    df = crawler.table_to_df(html)
    print(df.head(3))
    crawler.to_csv(day, df)
    print('DATA SAVED ...')
The problem is that the first iteration of the loop runs perfectly, but the second one stops with an error saying No tables where found (it occurs on the table = soup.find("table", {"class": "tablesaw-sortable"}) line). That happens because the page source is returned by WebCrawler.open_link before the webpage has fully loaded its contents, including the table with the weather information. There is also a chance that the website rejects the request because it is making the servers too busy.
Is there any way to build a loop that keeps trying to open the link until it can find the table, or at least waits until the table is loaded before returning the page source?
You can have selenium wait for a specific element. In your case it will be the table with the class name "tablesaw-sortable". I highly recommend that you use CSS selectors to find this element, as it's fast and less error prone than getting all table elements.
Here is the CSS selector, premade for you: table.tablesaw-sortable. Set selenium to wait until that element has loaded.
Source: https://stackoverflow.com/a/26567563/4159473
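Applied to open_link, the explicit wait from that answer might look like this; it is a sketch of a drop-in replacement for the open_link method above, and the seven-second timeout is just an example value:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def open_link(self, link):
    # Opens the link and returns the html source once the weather table is present
    self.driver.get(link)
    WebDriverWait(self.driver, 7).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.tablesaw-sortable"))
    )
    return self.driver.page_source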
I rewrote the code using the https://stackoverflow.com/a/26567563/4159473 solution suggested by @mildmelon, and I also added some delay between sending each request to the server and asking for the page source:
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.delay_for_page = 7
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens link and waits until the weather table is present on the page
        self.driver.get(link)
        myElem = WebDriverWait(self.driver, self.delay_for_page)\
            .until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable')))

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
date1 = date(2019, 2, 1)
date2 = date(2019, 3, 5)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('DATE : ', day)
    link = crawler.create_link(day)
    print('WAITING ....')
    print('')
    time.sleep(12)
    print('OPENING LINK ... ')
    try:
        crawler.open_link(link)
        html = crawler.driver.page_source
        print("DATA IS FETCHED")
        df = crawler.table_to_df(html)
        print(df.head(3))
        crawler.to_csv(day, df)
        print('DATA SAVED ...')
    except TimeoutException:
        print("NOT FETCHED ...!!!")
The weather information is fetched without problems. I guess the delay between requests resulted in better performance. The line myElem = WebDriverWait(self.driver, self.delay_for_page).until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable'))) has also improved speed.
