I have taken the Azure open datasets that are available for practice. I got 10 days of data from that dataset and now I want to save this data into DBFS in CSV format. I am facing an error:
"No such file or directory: '/dbfs/temp/hive/mytest.csv'"
On the other hand, I am able to access the path directly from DBFS, so the path looks correct.
My code:
from azureml.opendatasets import NoaaIsdWeather
from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

spark.sql('DROP TABLE IF EXISTS mytest')
dbutils.fs.rm("dbfs:/tmp/hive", recurse=True)

basepath = "dbfs:/tmp/hive"
try:
    dbutils.fs.ls(basepath)
except:
    dbutils.fs.mkdirs(basepath)
else:
    raise Exception("The Folder " + basepath + " already exist, this notebook will remove in the end")
dbutils.fs.mkdirs("dbfs:/tmp/hive")

start_date = parser.parse('2020-5-1')
end_date = parser.parse('2020-5-10')

isd = NoaaIsdWeather(start_date, end_date)
pdf = isd.to_spark_dataframe().toPandas().to_csv("/dbfs/temp/hive/mytest.csv")
What should I do?
Thanks
I tried reproducing the same issue. First I used the following code and made sure that the directory exists using os.listdir().
from azureml.opendatasets import NoaaIsdWeather
from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

spark.sql('DROP TABLE IF EXISTS mytest')
dbutils.fs.rm("dbfs:/tmp/hive", recurse=True)

basepath = "dbfs:/tmp/hive"
try:
    dbutils.fs.ls(basepath)
except:
    dbutils.fs.mkdirs(basepath)
else:
    raise Exception("The Folder " + basepath + " already exist, this notebook will remove in the end")
dbutils.fs.mkdirs("dbfs:/tmp/hive")

import os
os.listdir("/dbfs/tmp/hive/")
Then I used the following to write the CSV using to_pandas_dataframe(). This successfully wrote the required dataframe to a CSV file at the required path.
mydf = isd.to_pandas_dataframe()
mydf.to_csv("/dbfs/tmp/hive/mytest.csv")
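For reference, the toPandas() approach from the question should also work once the FUSE path matches the directory that was created (dbfs:/tmp/hive is exposed locally as /dbfs/tmp/hive, not /dbfs/temp/hive). A minimal sketch:
# Sketch: write the Spark result via pandas to the directory created above.
# The /dbfs prefix is the DBFS FUSE mount, so dbfs:/tmp/hive corresponds to /dbfs/tmp/hive.
pdf = isd.to_spark_dataframe().toPandas()
pdf.to_csv("/dbfs/tmp/hive/mytest.csv", index=False)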
Related
Sorry, I had written a similar question elsewhere but have forgotten the email ID it was posted under, hence writing it again.
I am trying to write code that reads multiple docx files, extracts a particular table from each of them, and creates a dataframe for each file. My code goes like this:
import pandas as pd
from docx import Document
import os

directory = 'C:\\folder1\\Folder2\\Folder3\\Folder4'
for root, dirs, files in os.walk(directory):
    for document in files:
        if document.endswith('.docx'):
            table_num = 4
            nheader = 2
            table = document.tables[table_num - 1]
            data = [[cell.text for cell in row.cells] for row in table.rows]
            df = pd.DataFrame(data)
            df = 'df_' + document
            outside_col, inside_col = df.iloc[0], df.iloc[1]
            hier_index = pd.MultiIndex.from_tuples(list(zip(outside_col, inside_col)))
            df = pd.DataFrame(data, columns=hier_index).drop(df.index[[0, 1]]).reset_index(drop=True)
I am getting the error AttributeError: 'str' object has no attribute 'tables', but I was able to run the same thing for a single file as a function.
import pandas as pd
from docx import Document
import os

directory = 'C:\\folder1\\Folder2\\Folder3\\Folder4'

def read_docx_table(document, table_num=1, nheader=1):
    table = document.tables[table_num - 1]
    data = [[cell.text for cell in row.cells] for row in table.rows]
    df = pd.DataFrame(data)
    if nheader == 1:
        df = df.rename(columns=df.iloc[0]).drop(df.index[0]).reset_index(drop=True)
    elif nheader == 2:
        outside_col, inside_col = df.iloc[0], df.iloc[1]
        hier_index = pd.MultiIndex.from_tuples(list(zip(outside_col, inside_col)))
        df = pd.DataFrame(data, columns=hier_index).drop(df.index[[0, 1]]).reset_index(drop=True)
    elif nheader > 2:
        print("Not Working")
        df.DataFrame()
    return df

document = Document("file1.docx")
table_num = 2
nheader = 0
df = read_docx_table(document, table_num, nheader)
print(df)
I got help and changed the code accordingly. The problem was that document was a string (the filename) before; now it is a Document object.
import pandas as pd
from docx import Document
import os

directory = 'path'
for root, dirs, files in os.walk(directory):
    for document in files:
        if document.endswith('.docx'):
            table_num = 4
            nheader = 2
            joined_path = os.path.join(directory, document)
            document1 = Document(joined_path)
            table = document1.tables[table_num - 1]
            data = [[cell.text for cell in row.cells] for row in table.rows]
            df = pd.DataFrame(data)
            outside_col, inside_col = df.iloc[0], df.iloc[1]
            hier_index = pd.MultiIndex.from_tuples(list(zip(outside_col, inside_col)))
            df1 = pd.DataFrame(data, columns=hier_index).drop(df.index[[0, 1]]).reset_index(drop=True)
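If the goal is to keep one dataframe per file rather than overwriting a single variable on each iteration, here is a minimal sketch (assuming every file has the same table layout) that collects them in a dictionary keyed by filename:
import os
import pandas as pd
from docx import Document

dataframes = {}  # filename -> dataframe
for root, dirs, files in os.walk('path'):
    for name in files:
        if name.endswith('.docx'):
            doc = Document(os.path.join(root, name))
            table = doc.tables[3]  # the 4th table, as in the question
            data = [[cell.text for cell in row.cells] for row in table.rows]
            header = pd.MultiIndex.from_tuples(list(zip(data[0], data[1])))
            dataframes[name] = pd.DataFrame(data[2:], columns=header)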
I am trying to read from Excel and load the data into MongoDB using PyMongo.
The error I get is "cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>". When I researched it, I was told to use the utf-8-sig format to insert the data into MongoDB, but there is no option to use utf-8 when building the pandas dataframe.
from pymongo import MongoClient
from datetime import datetime
import pandas as pd
import Parameters
import pandasql as pf
import json
import pymongo
import xlrd
from pathlib import Path
import os
import constants

try:
    class conn:
        def __init__(self):
            client = pymongo.MongoClient("mongodb://" + constants.USER_NAME + ":" + constants.PWD + constants.server + constants.CA_CERTIFICATES_PATH)
            db = client[Parameters.STG_QC_Hub_Files]
            week = "08-02-2021"
            out_col = db[Parameters.col]
            filename = "1.xlsx"
            path1 = Path('//test3' + '/' + filename)
            data_load_date = datetime.today().strftime('%m-%d-%Y')
            df1 = pd.read_excel(path1, sheet_name="AU-ARCM Details", keep_default_na=False)
            # df1 = pd.read_excel(xls+filename, keep_default_na=False, encoding='utf-8-sig')
            # df1 = pd.read_csv(xls, keep_default_na=False, encoding='utf-8-sig').iloc[:, :86]
            df1["Week"] = week
            df1["Data Load Date"] = data_load_date
            df1 = df1.astype('string')
            # df1.index = df1.index.str.encode('utf-8')
            df1 = df1.drop(['Source.Name'], axis=1)
            records = json.loads(df1.T.to_json()).values()
            out_col.insert_many(df1.to_dict('records'))
            print("Imported File " + str(filename) + " with " + str(len(records)) + " records")

    c = conn()
except Exception as e:
    print(e)
Traceback:
File "C:\Users\PycharmProjects\ReMs\venv\lib\site-packages\pymongo\message.py", line 1323, in _do_batched_op_msg
operation, command, docs, check_keys, ack, opts, ctx)
bson.errors.InvalidDocument: cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>
You have some blank cells in your spreadsheet for which pandas uses its own missing-value type (NAType); pymongo doesn't know what to do with this type, hence the error. You will need to remove these in order to load the values into MongoDB using the method you are using.
Consider something like this just before you attempt the insert:
import numpy as np
df1 = df1.replace(np.nan, None)
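Since the frame has also been cast to the nullable string dtype, the missing values may show up as pd.NA rather than np.nan. Here is a variant (a sketch, not the original answer's code) that converts any pandas missing value to None at the record level, which pymongo encodes as a BSON null:
import pandas as pd

# Sketch: replace every pandas missing value (np.nan, pd.NA, NaT) with None in each record.
records = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in df1.to_dict('records')
]
out_col.insert_many(records)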
In this program I am not using requests or BeautifulSoup; I am only using datetime to build the URLs. Currently the program is written to extract values for a long period. I want to change it so that, if I automate this program and it runs today, it extracts yesterday's data; similarly, if it runs tomorrow, it extracts today's data, and so on.
Here is the code:
import datetime
from datetime import date, datetime, timedelta
import warnings
import datetime
import pandas as pd
import wget
import glob
import os
warnings.filterwarnings("ignore")
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from urllib.error import HTTPError

def date_range(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

def get_urls(base_url):
    part_two = "/dailyCoal1-"
    end_part = ".xlsx"
    start_date = date(2020, 11, 1)
    end_date = datetime.datetime.now().date()
    start_urls = list()
    for single_date in date_range(start_date, end_date):
        start_urls.append(single_date.strftime(base_url + '%d-%m-%Y' + part_two + '%Y-%m-%d' + end_part))
    return start_urls

def excel_download(link, out):
    # downloads a given link to the output directory out
    wget.download(link, out)

if __name__ == "__main__":
    base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
    mypath = "/Users/vp/Desktop/temp"
    temp_folder = '/Users/vp/Desktop/temp'
    out_folder = "/Users/vp/Desktop/NPP"
    log_file = os.path.join(out_folder, 'debug_log_npp.log')
    out_file = os.path.join(out_folder, 'Energy_inputs_npp.csv')
    file_links = get_urls(base_url)
    for link in file_links:
        try:
            excel_download(link, temp_folder)
        except HTTPError:
            content = "HTTP issue while capturing data for this link - " + link
            log_writer(log_file, content)
            continue
        file = glob.glob(os.path.join(temp_folder, '*.xlsx'), recursive=True)[0]
        df = pd.read_excel(file)
To capture yesterday's data, I added the following in the main function, where I check against yesterday and discard the file if the date doesn't match. But it throws an error because it keeps picking the start date as its day one.
if date_time_obj != Yesterday:
    os.remove(file)
    content = "Date mis-matched - " + str(date_time_obj) + " " + str(Yesterday)
In this program, date_time_obj is the date it is currently trying to extract data for.
If this program runs every day at 8 pm, it needs to capture only the previous day's data.
If this cannot be done with datetime alone, but only with requests or bs4, how do I approach the problem?
I don't know if you wanted a valid link, as your code doesn't seem to produce one for me, but you only need to tweak it to work off start_date alone and return a single item: yesterday's link, matching your current output for the same date.
import datetime
from datetime import date, datetime, timedelta
import warnings
import datetime
import pandas as pd
import wget
import glob
import os
warnings.filterwarnings("ignore")
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from urllib.error import HTTPError

def get_url(base_url):
    part_two = "/dailyCoal1-"
    end_part = ".xlsx"
    start_date = datetime.datetime.now().date() + timedelta(-1)
    start_url = start_date.strftime(base_url + '%d-%m-%Y' + part_two + '%Y-%m-%d' + end_part)
    return start_url

def excel_download(link, out):
    # downloads a given link to the output directory out (uses wget, imported above)
    wget.download(link, out)

if __name__ == "__main__":
    base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
    mypath = "/Users/vp/Desktop/temp"
    temp_folder = '/Users/vp/Desktop/temp'
    out_folder = "/Users/vp/Desktop/NPP"
    log_file = os.path.join(out_folder, 'debug_log_npp.log')
    out_file = os.path.join(out_folder, 'Energy_inputs_npp.csv')
    file_link = get_url(base_url)
    print(file_link)
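To then actually fetch yesterday's file, the main block could call excel_download on that single link and log HTTP failures, mirroring the loop from the question. A minimal sketch (log_writer is the question's own logging helper and is assumed to exist):
# Sketch: download yesterday's report and load it, logging if the file is not published yet.
try:
    excel_download(file_link, temp_folder)
    file = glob.glob(os.path.join(temp_folder, '*.xlsx'))[0]
    df = pd.read_excel(file)
except HTTPError:
    log_writer(log_file, "HTTP issue while capturing data for this link - " + file_link)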
I need to dynamically save my pandas data frame. I have successfully managed to output a CSV file with a static name using the following code:
export_csv = df.to_csv(r'static_name.csv', header=False , index=False)
But I have failed to make this work dynamically. With the code below, I expect to get a file named after args.save_file_name with a .csv suffix. However, I get no result.
import os
import argparse
import pandas as pd
parser = argparse.ArgumentParser()
parser.add_argument('file', help="this is the file you want to open")
parser.add_argument('save_file_name', help="the name of the file you want for the output CSV")
args = parser.parse_args()
print("file name:", args.file) # checking that this worked
...
# export csv
path = os.getcwd()
export_path = path + args.save_file_name + '.csv'
export_csv = df.to_csv(path_or_buf=export_path, header=False, index=False)
I think the problem is in your export_path variable, which is not built correctly: os.getcwd() does not end with a path separator, so the filename gets glued onto the directory name. The following should do the job.
export_path = os.path.join(path, args.save_file_name + '.csv')
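For context, a minimal sketch of the corrected export step, assuming df is the dataframe built earlier in the script:
# Sketch: os.path.join inserts the separator that plain string concatenation was missing.
path = os.getcwd()
export_path = os.path.join(path, args.save_file_name + '.csv')
df.to_csv(path_or_buf=export_path, header=False, index=False)
print("saved to:", export_path)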
I'm trying to read a .db file in Python code, but I am getting a "no table found" error. However, I can see the table when I import the file into a MySQL DB.
import sqlite3
import pandas as pd

con = None

def getConnection():
    databaseFile = "test.db"
    global con
    if con == None:
        con = sqlite3.connect(databaseFile)
    return con

def queryExec():
    con = getConnection()
    result = pd.read_sql_query("select * from Movie;", con)
    result

queryExec()
I even tried using the absolute path of the .db file, but no luck.
Assuming you're trying to read data from a SQLite database file, here is a simpler way to do it.
import sqlite3
import pandas as pd

con = sqlite3.connect("test.db")
with con:
    df = pd.read_sql("select * from Movie", con)
    print(df)
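A "no such table" error from sqlite3 usually means the connection silently created a brand-new, empty database because the path did not point at the real file. A quick check (a sketch, not part of the original answer; adjust the path to your own file):
import os
import sqlite3

db_path = os.path.abspath("test.db")  # hypothetical location; use the real path to your .db
print(db_path, "exists:", os.path.exists(db_path))

con = sqlite3.connect(db_path)
tables = con.execute("select name from sqlite_master where type='table'").fetchall()
print(tables)  # should include ('Movie',) if the table really exists in this file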