Unable to infer schema when loading file - python-3.x

The below code works fine when run in the pyspark shell, but it fails when executed via spark-submit with master yarn.
What am I doing wrong here?
from datetime import date, timedelta
import os, sys
import pandas as pd
from pyspark.sql import SparkSession

startd = '20140101'
endd = str(sys.argv[1])
currd = str(sys.argv[2])
spark = SparkSession.builder.getOrCreate()
base = "s3://metadata_v1/DATE="
dstart = pd.to_datetime(startd).date()
dend = pd.to_datetime(endd).date()
# build one S3 partition path per Saturday between the start and end dates
s3 = []
days = [dstart + timedelta(days=x) for x in range((dend - dstart).days + 1) if (dstart + timedelta(days=x)).weekday() == 5]
for i in days:
    s3.append(base + i.strftime('%Y-%m-%d'))
data = spark.read.option("header", "True").option("delimiter", "|").option("basePath", "s3://metadata_v1/").csv(s3)
Error:
pyspark.sql.utils.AnalysisException: u'Unable to infer schema for CSV. It must be specified manually.;'

This can happen when some of your CSV files contain a header row, so some columns fail to load when Spark tries to infer their data types. You can try removing the header row from each CSV file before reading them into a dataframe.
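Alternatively, schema inference can be bypassed entirely by supplying the schema yourself. A minimal sketch, assuming hypothetical column names and types (replace them with the actual layout of your files):
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Hypothetical schema -- substitute the real columns of the metadata files
schema = StructType([
    StructField("id", StringType(), True),
    StructField("value", DoubleType(), True),
])
data = spark.read.option("header", "True").option("delimiter", "|") \
    .option("basePath", "s3://metadata_v1/").schema(schema).csv(s3)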

Related

pandas datareader. Save all data to one dataframe

I am new to Python and I am having trouble getting the data into one dataframe.
I have the following code.
from pandas_datareader import data as pdr
from datetime import date
from datetime import timedelta
import yfinance as yf
yf.pdr_override()
import pandas as pd

# tickers list
ticker_list = ['0P0001A532.CO','0P00018Q4V.CO','0P00017UBI.CO','0P00000YYT.CO','PFIBAA.CO','PFIBAB.CO','PFIBAC.CO','PFIDKA.CO','PFIGLA.CO','PFIMLO.CO','PFIKRB.CO','0P00019SMI.F','WEKAFKI.CO','0P0001CICW.CO','WEISTA.CO','WEISTS.CO','WEISA.CO','WEITISOP.CO']
today = date.today()

# We can get data by our choice by days bracket
if date.today().weekday() == 0:
    start_date = (today + timedelta((4 + today.weekday()) % 7)) - timedelta(days=7)  # Friday. If it is monday we do not have a price since it is based on the previous day close.
else:
    start_date = today - timedelta(days=1)

files = []
allData = []
dafr_All = []

def getData(ticker):
    print(ticker)
    data = pdr.get_data_yahoo(ticker, start=start_date, end=(today + timedelta(days=2)))['Adj Close']
    dataname = ticker + '_' + str(today)
    files.append(dataname)
    allData.append(data)
    SaveData(data, dataname)

# Create a data folder in your current dir.
def SaveData(df, filename):
    df.to_csv('./data/' + filename + '.csv')

# This loop will iterate over ticker list, will pass one ticker to get data, and save that data as file.
for tik in ticker_list:
    getData(tik)
for i in range(0, 11):
    df1 = pd.read_csv('./data/' + str(files[i]) + '.csv')
    print(df1.head())
I get several CSV files containing the adjusted close values (when an adjusted close exists).
I want to combine all the data into one dataframe where the first column contains the tickers and the second column contains the adjusted close values. That dataframe then needs to be exported to a CSV file.
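One possible approach (a minimal sketch, not a tested solution; the column names Ticker and AdjClose are my own) is to reuse the allData list that getData fills in, pair each downloaded series with its ticker, and concatenate everything once at the end:
import pandas as pd

# allData holds one 'Adj Close' Series per ticker (filled in by getData above);
# pair each one with its ticker and stack them into a single long dataframe.
frames = [
    pd.DataFrame({'Ticker': tik, 'AdjClose': series.values})
    for tik, series in zip(ticker_list, allData)
]
combined = pd.concat(frames, ignore_index=True)
combined.to_csv('./data/all_tickers_' + str(today) + '.csv', index=False)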

Python function to iterate each unique column and transform using pyspark

I'm building the following global function in Pyspark to go through each column in my CSV, which are in different formats, and convert them all to one uniform format with characters such as spaces and "#" replaced by "_".
I am new to the Python world, and I am getting
TypeError: Column is not iterable
employeesDF is a dataframe read from a CSV file on the local file system.
I tried the below code:
def colrename(df):
    for col in employeesDF.columns:
        F.col(col).alias(col.replace('/s,#', '_'))
    return employeesDF

ndf = colrename(employeesDF.columns)
This will work:
import re

def colrename(column):
    reg = re.sub(r'\s|#', '_', column)
    return reg

df2 = df2.toDF(*(colrename(c) for c in df2.columns))
In case anyone is interested, I used the code below to do it. I hope this information is useful. Thanks.
from pyspark.sql import *
import re

spark = SparkSession.builder.master("local").appName("test").getOrCreate()
df = spark.read.format('csv')\
    .option('header', True)\
    .option('inferschema', True)\
    .load('C:\\bigdata\\datasets\\employee10000_records.csv')

def colrename(df):
    for names in df.schema.names:
        df = df.withColumnRenamed(names, re.sub(r'([^A-Za-z0-9])', '_', names))
    return df

colrename(df).show()
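To illustrate what the substitution does on its own (the column names below are hypothetical, not from the original file), the same regex in plain Python:
import re

cols = ['emp id', 'first name', 'dept#no']  # hypothetical column names
print([re.sub(r'([^A-Za-z0-9])', '_', c) for c in cols])
# ['emp_id', 'first_name', 'dept_no']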

Reading multiple excel files into a pandas dataframe, but also storing the file name

I would like to read multiple excel files and store them into a single pandas dataframe, but I would like one of the columns in the dataframe to be the file name. This is because the file name contains the date (this is monthly data) and I need that information. I can't seem to get the filename, but I'm able to get the excel files into a dataframe. Please help.
import os
import pandas as pd
import fsspec

files = os.listdir("C://Users//6J2754897//Downloads//monthlydata")
paths = "C://Users//6J2754897//Downloads//monthlydata"
a = pd.DataFrame([2], index=None)
df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(pd.read_excel(paths + "//" + files[file], sheet_name="information", skiprows=7), ignore_index=True)
        df['Month'] = str(files[file])
The order of operations here is incorrect. The line
df['Month'] = str(files[file])
is going to overwrite the entire column with the most recent value.
Instead we should only add the value to the current DataFrame:
import os
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)
df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        # Read in File
        file_df = pd.read_excel(paths + "//" + files[file],
                                sheet_name="information",
                                skiprows=7)
        # Add to just this DataFrame
        file_df['Month'] = str(files[file])
        # Update `df`
        df = df.append(file_df, ignore_index=True)
Alternatively we can use DataFrame.assign to chain the column assignment:
import os
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)
df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(
            # Read in File
            pd.read_excel(paths + "//" + files[file],
                          sheet_name="information",
                          skiprows=7)
            .assign(Month=str(files[file])),  # Add to just this DataFrame
            ignore_index=True
        )
For general overall improvements we can use pd.concat with a list comprehension over the files. This avoids repeatedly growing the DataFrame (which can be extremely slow). pathlib's Path.glob also helps with selecting the appropriate files:
from pathlib import Path
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
df = pd.concat([
    pd.read_excel(file,
                  sheet_name="information",
                  skiprows=7)
    .assign(Month=file.stem)  # We may also want file.name here
    for file in Path(paths).glob('*.xlsx')
])
Some options for the Month column are:
file.stem will give "[t]he final path component, without its suffix".
'folder/folder/sample.xlsx' -> 'sample'
file.name will give "the final path component, excluding the drive and root".
'folder/folder/sample.xlsx' -> 'sample.xlsx'
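If the goal is the date embedded in the file name, the Month column can then be parsed into a real datetime. A minimal sketch, assuming (hypothetically) that the file names contain a year-month token like 2021-05; the regex and format string would need to match your actual naming scheme:
# Hypothetical: pull a "YYYY-MM" token out of the stored file name and parse it
df['MonthDate'] = pd.to_datetime(
    df['Month'].str.extract(r'(\d{4}-\d{2})')[0],
    format='%Y-%m'
)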

Python: Identify invalid online link for a zip file

I am trying to automate stock price data extraction from https://www.nseindia.com/. The data is stored as a zip file and the url for the zip file varies by date. If on a certain date the stock market is closed, e.g. weekends and holidays, there is no file/url for that date.
I want to identify invalid links (links that don't exist) and skip to the next link.
This is a valid link -
path = 'https://archives.nseindia.com/content/historical/EQUITIES/2021/MAY/cm05MAY2021bhav.csv.zip'
This is an invalid link - (as 1st May is a weekend and stock market is closed for the day)
path2 = 'https://archives.nseindia.com/content/historical/EQUITIES/2021/MAY/cm01MAY2021bhav.csv.zip'
This is what I do to extract the data
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
import pandas as pd
import datetime

start_date = datetime.date(2021, 5, 3)
end_date = datetime.date(2021, 5, 7)
delta = datetime.timedelta(days=1)
final = pd.DataFrame()
while start_date <= end_date:
    print(start_date)
    day = start_date.strftime('%d')
    month = start_date.strftime('%b').upper()
    year = start_date.strftime('%Y')
    start_date += delta
    path = 'https://archives.nseindia.com/content/historical/EQUITIES/' + year + '/' + month + '/cm' + day + month + year + 'bhav.csv.zip'
    file = 'cm' + day + month + year + 'bhav.csv'
    try:
        with urlopen(path) as f:
            with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
                foofile = myzipfile.open(file)
                df = pd.read_csv(foofile)
                final.append(df)
    except:
        print(file + 'not there')
If the path is invalid, Python gets stuck and I have to restart it. I am not able to handle the error or identify the invalid link while looping over multiple dates.
What I have tried so far to differentiate between valid and invalid links -
# Attempt 1
import os
os.path.exists(path)
os.path.isfile(path)
os.path.isdir(path)
os.path.islink(path)
# output is False for both Path and Path2
# Attempt 2
import validators
validators.url(path)
# output is True for both Path and Path2
# Attempt 3
import requests
site_ping = requests.get(path)
site_ping.status_code < 400
# Output for Path is True, but Python crashes/gets stuck when I run requests.get(path2) and I have to restart everytime.
Thanks for your help in advance.
As suggested by SuperStormer, adding a timeout to the request solved the issue:
try:
    with urlopen(zipFileURL, timeout=5) as f:
        with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
            foofile = myzipfile.open(file)
            df = pd.read_csv(foofile)
            final.append(df)
except:
    print(file + 'not there')
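An alternative sketch (not from the original answer) is to pre-check each URL with requests, using a timeout and the status code, so that dates without a file are skipped before downloading; the 5-second timeout is an arbitrary choice and the archive server may need request headers in practice:
import requests

def url_exists(url, timeout=5):
    # A HEAD request is enough to check whether the archive exists for that date
    try:
        resp = requests.head(url, timeout=timeout)
        return resp.status_code < 400
    except requests.RequestException:
        return False

if url_exists(path):
    # proceed with the urlopen/ZipFile logic from above
    pass
else:
    print(file + ' not there')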

Converting string with nano seconds to timestamp

I'm trying to convert a String datatype column to a Timestamp data type, but I'm getting None as a result.
Sample Data and Code
20181016T192403.635918+02:00
date_format = "yyyyMMdd'T'HHmmss.SSSSSSZ"
data_frame = data_frame.withColumn('dob_ts', unix_timestamp('dob', date_format).cast('timestamp'))
Other formats (yyyyMMdd'T'HHmmss.SSS) work fine, but not this one.
How can I convert this format to a timestamp?
You can use a udf to define your own conversion function. Inside the user-defined function you can handle this case with an if, or however you want:
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.types import TimestampType

def date_time_to_date(input_date_time):
    # Drop the 'T' separator so the string matches the strptime format below
    split_ind = input_date_time.find('T')
    new_date = input_date_time
    if split_ind > -1:
        new_date = input_date_time[:split_ind] + input_date_time[split_ind + 1:]
    # '%z' (Python 3.7+) consumes the trailing '+02:00' offset
    return datetime.strptime(new_date, '%Y%m%d%H%M%S.%f%z')

udf_date_time_to_date = udf(date_time_to_date, TimestampType())
data_frame = data_frame.withColumn('dob_ts', udf_date_time_to_date('dob'))
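A quick sanity check of the function (a sketch using the sample value from the question; the column name 'dob' matches the snippet above):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
test_df = spark.createDataFrame([("20181016T192403.635918+02:00",)], ["dob"])
test_df.withColumn("dob_ts", udf_date_time_to_date("dob")).show(truncate=False)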
