memory error on web.datareader using pandas - python-3.x

I have a block of code that works; however, I get memory errors or extremely long run times. Is there a more elegant solution that requires less memory or runs in less time?
import pandas as pd
import pandas.io.data as web
import datetime
#Grabs tickers from html
exchList = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', infer_types=False)
sp500 = []
for ticker in exchList[0][0][1:]:
    sp500.append(ticker)
sp500 = [w.replace('.','-') for w in sp500]
#sets date for data fetch
start = datetime.datetime(2000,1,1)
end = datetime.date.today()
#fetches data from yahoo and prints to csv
p = web.DataReader(sp500, "yahoo", start, end)
main_df = p.to_frame()
noIndex = main_df.reset_index()
noIndex.columns.values[1]= 'Name'
indexed = noIndex.set_index('Date')
csv = indexed.to_csv('edata.csv')

import pandas.io.data has been removed in modern pandas versions and raises an ImportError:
In [113]: import pandas.io.data
...
ImportError: The pandas.io.data module is moved to a separate package (pandas-datareader). After installing the pandas-datareader package (https://github.com/pandas-dev/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
So we should use pandas_datareader instead:
from pandas_datareader import data as web
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# scrape the tickers and normalize them for Yahoo (e.g. BRK.B -> BRK-B)
sp500 = pd.read_html(url)[0].iloc[1:, 0].str.replace('\.', '-')
# download all tickers in one call and flatten the result into a DataFrame
df = web.DataReader(sp500, "yahoo", '2000-01-01').to_frame()
Memory usage:
In [112]: df.memory_usage()
Out[112]:
Index         7974167
Open         15870936
High         15870936
Low          15870936
Close        15870936
Volume       15870936
Adj Close    15870936
dtype: int64
Execution time:
In [115]: %timeit -n 1 -r 1 web.DataReader(sp500, "yahoo", '2000-01-01').to_frame()
1 loop, best of 1: 1min 57s per loop
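If even the flattened frame is too large, one way to keep memory roughly constant is to download one ticker at a time and append each result straight to the CSV. This is only a sketch, assuming the Yahoo source in pandas_datareader still responds for these symbols:
from pandas_datareader import data as web
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tickers = pd.read_html(url)[0].iloc[1:, 0].str.replace('.', '-', regex=False)
first = True
for ticker in tickers:
    try:
        # fetch a single ticker's history
        df = web.DataReader(ticker, 'yahoo', '2000-01-01')
    except Exception:
        continue  # skip symbols Yahoo cannot resolve
    df['Name'] = ticker
    # append to the CSV so the full panel never has to live in memory
    df.to_csv('edata.csv', mode='a', header=first)
    first = False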

Related

pandas datareader. Save all data to one dataframe

I am new to Python and I have trouble getting data into one dataframe.
I have the following code.
from pandas_datareader import data as pdr
from datetime import date
from datetime import timedelta
import yfinance as yf
yf.pdr_override()
import pandas as pd
# tickers list
ticker_list = ['0P0001A532.CO','0P00018Q4V.CO','0P00017UBI.CO','0P00000YYT.CO','PFIBAA.CO','PFIBAB.CO','PFIBAC.CO','PFIDKA.CO','PFIGLA.CO','PFIMLO.CO','PFIKRB.CO','0P00019SMI.F','WEKAFKI.CO','0P0001CICW.CO','WEISTA.CO','WEISTS.CO','WEISA.CO','WEITISOP.CO']
today = date.today()
# We can get data by our choice by days bracket
if date.today().weekday()==0:
    start_date = (today + timedelta((4 + today.weekday()) % 7)) - timedelta(days=7) # Friday. If it is monday we do not have a price since it is based on the previous day close.
else:
    start_date = today - timedelta(days=1)
files=[]
allData = []
dafr_All = []
def getData(ticker):
    print(ticker)
    data = pdr.get_data_yahoo(ticker, start= start_date, end=(today + timedelta(days=2)))['Adj Close']
    dataname = ticker+'_'+str(today)
    files.append(dataname)
    allData.append(data)
    SaveData(data, dataname)
# Create a data folder in your current dir.
def SaveData(df, filename):
    df.to_csv('./data/'+filename+'.csv')
#This loop will iterate over ticker list, will pass one ticker to get data, and save that data as file.
for tik in ticker_list:
    getData(tik)
for i in range(0,11):
    df1= pd.read_csv('./data/'+ str(files[i])+'.csv')
    print(df1.head())
I get several CSV files containing the adjusted close values (when an adjusted close exists).
I want to save all the data to one dataframe where the first column consists of the tickers and the second column consists of the adjusted close values. The dataframe then needs to be exported to a CSV file.
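One possible way to get that two-column layout is to collect each ticker's adjusted close and concatenate the pieces at the end, then write the result once. This is just a sketch reusing the names from the code above (pdr, ticker_list, start_date, today) and assuming each download comes back as a Series indexed by date:
frames = []
for tik in ticker_list:
    try:
        adj = pdr.get_data_yahoo(tik, start=start_date, end=(today + timedelta(days=2)))['Adj Close']
    except Exception:
        continue  # no data / no adjusted close for this ticker
    frames.append(pd.DataFrame({'Ticker': tik, 'Adj Close': adj}))
all_df = pd.concat(frames).reset_index()   # columns: Date, Ticker, Adj Close
all_df = all_df[['Ticker', 'Adj Close']]
all_df.to_csv('./data/all_tickers_' + str(today) + '.csv', index=False)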

Python code to get MFI indicator from Binance futures market, wrong calculation

I'm starting out in Python and I'm trying to customize a piece of code I found on the internet to get the MFI indicator for cryptocurrencies on the futures market of the Binance broker. The code runs, but the MFI it calculates does not match what Binance shows. Did I get the calculation formula wrong?
My code is below; it returns a list of the last MFI values, the last of which should be the current one. If anyone knows what is going on, please help.
import warnings
import numpy as np
import pandas as pd
import requests  # needed for the klines request below
from stockstats import StockDataFrame
import time
import config
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
symbol = 'ADAUSDT'
timeinterval = 5
while (1 == 1):
    url = 'https://fapi.binance.com/fapi/v1/klines?symbol='+symbol+'&interval='+str(timeinterval)+'m'+'&limit=100'
    data = requests.get(url).json()
    D = pd.DataFrame(data)
    D.columns = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'qav', 'num_trades', 'taker_base_vol', 'taker_quote_vol', 'is_best_match']
    typical_price = (D['close'].astype(float) + D['high'].astype(float) + D['low'].astype(float)) / 3
    period = 14
    money_flow = typical_price * D['volume'].astype(float)
    positive_flow = []
    negative_flow = []
    for i in range(1, len(typical_price)):
        if typical_price[i] > typical_price[i-1]:
            positive_flow.append(money_flow[i-1])
            negative_flow.append(0)
        elif typical_price[i] < typical_price[i-1]:
            negative_flow.append(money_flow[i-1])
            positive_flow.append(0)
        else:
            positive_flow.append(0)
            negative_flow.append(0)
    positive_mf = []
    negative_mf = []
    for i in range(period-1, len(positive_flow)):
        positive_mf.append(sum(positive_flow[i+1-period : i+1]))
    for i in range(period-1, len(negative_flow)):
        negative_mf.append(sum(negative_flow[i+1-period : i+1]))
    mfi = 100 * (np.array(positive_mf) / (np.array(positive_mf) + np.array(negative_mf)))
    print(mfi)
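For comparison, the common MFI formulation classifies the current bar's raw money flow (money_flow[i], not money_flow[i-1]) as positive or negative depending on whether the typical price rose or fell versus the previous bar, then takes 100 - 100 / (1 + positive/negative) over the period. A vectorised sketch of that version, assuming the same D dataframe built above:
high = D['high'].astype(float)
low = D['low'].astype(float)
close = D['close'].astype(float)
volume = D['volume'].astype(float)
tp = (high + low + close) / 3            # typical price
raw_mf = tp * volume                     # raw money flow of the current bar
delta = tp.diff()
pos_mf = raw_mf.where(delta > 0, 0.0)    # counted when the typical price rose
neg_mf = raw_mf.where(delta < 0, 0.0)    # counted when the typical price fell
period = 14
mfr = pos_mf.rolling(period).sum() / neg_mf.rolling(period).sum()
mfi = 100 - 100 / (1 + mfr)
print(mfi.tail())
Any remaining difference from the value Binance displays may also come from the last kline, which is the still-forming candle.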

Getting the elements of list in specific range in Python using negative indexing

Input
list1 = ['Apple','Google','MS','Facebook']
print(list1)
list1[-4:1]
Output
['Apple', 'Google', 'MS', 'Facebook']
['Apple']
Can anyone please explain the result?
When you use negative indexing, the last element is index -1 (it would be silly for list1[-0] to mean something different from list1[0]). Because of this, your slice means "take elements starting from the 4th-from-last and going up to, but not including, index 1". Another way to think of it: list1[-4] is the same as list1[len(list1) - 4]. So here the slice covers the range [0, 1) and only returns the first element.
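A quick interactive check of that equivalence:
list1 = ['Apple', 'Google', 'MS', 'Facebook']
# a negative start index is converted by adding len(list1), so -4 becomes 0
print(list1[-4:1])                             # ['Apple']
print(list1[0:1])                              # ['Apple']
print(list1[-4:1] == list1[len(list1) - 4:1])  # True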
This may help you.
from math import sqrt
from sklearn.cluster import MiniBatchKMeans
import pandas_datareader as dr
from matplotlib import pyplot as plt
import pandas as pd
import matplotlib.cm as cm
import seaborn as sn
start = '2020-1-1'
end = '2021-1-1'
tickers = ['AXP','AAPL','BA','CAT','CSCO','CVX','XOM','GS']
prices_list = []
for ticker in tickers:
    try:
        prices = dr.DataReader(ticker,'yahoo',start)['Adj Close']
        prices = pd.DataFrame(prices)
        prices.columns = [ticker]
        prices_list.append(prices)
    except:
        pass
prices_df = pd.concat(prices_list,axis=1)
prices_df.sort_index(inplace=True)
prices_df.head()
Index -4 means the 4th-from-last element, so list1[-4:1] is the same as list1[0:1], which is the same as [list1[0]], which is ['Apple'].
(To give another example, list1[1:-1] == list1[1:3] == [list1[1], list1[2]] == ['Google', 'MS'].)

AttributeError: 'datetime.datetime' object has no attribute 'striftime'

I am currently writing a machine learning program for school to predict the weather. I have been using this article, https://stackabuse.com/using-machine-learning-to-predict-the-weather-part-1/, as my main resource (I have had to adjust it, as Wunderground is no longer free, so I have been using OpenWeatherMap instead). While writing the data collection and organization part of my code I received the following error: AttributeError: 'datetime.datetime' object has no attribute 'striftime'. Sorry in advance for the massive block of code; I figured it would be the best way to troubleshoot the problem. Thank you for any help. The parts marked with '** code **' are what I am struggling with.
from datetime import datetime
from datetime import timedelta
import time
from collections import namedtuple
import pandas as pd
import requests
import matplotlib.pyplot as plt
#Data collection and Organization
url = 'http://history.openweathermap.org//storage/d12a3df743e650ba4035d2c6d42fb68f.json'
#res = requests.get(url)
#data = res.json()
target_date = datetime(2018, 4, 22)
features = ["date", "temperature", "pressure", "humidity", "maxtemperature", "mintemperature"]
DailySummary = namedtuple("DailySummary", features)
def extra_weather_data(url, target_date, days):
    for _ in range(days):
        **request = url.format(target_date.striftime('%Y%m%d'))**
        respone = requests.get(request)
        if response.status_code == 200:
            data = response.json()
            records.append(DailySummary(
                date = target_date,
                temperature = data['main']['temp'],
                pressure = data['main']['pressure'],
                humidity = data['main']['humidity'],
                maxtemperature = data['main']['temp_max'],
                mintemperature = data['main']['temp_min']))
        time.sleep(6)
        target_date += timedelta(days=1)
**records = extra_weather_data(url, target_date, 365)**
#Finished data collection now begin to clean and process data using Pandas
df = pd.DataFrame(records, columns=features).set_index('date')
tmp = df[['temperature','pressure','humidty', 'maxtemperature', 'mintemperature']].head(10)
def derive_nth_day_feature(df, feature, N):
    rows = df.shape[0]
    nth_prior_measurements = [None]*N + [df[feature][i-N] for i in range(N,rows)]
    col_name = "{}_{}".format(feature, N)
    df[col_name] = nth_prior_measurements
for feature in features:
    if feature != 'date':
        for N in range(1, 4):
            derive_nth_day_feature(df, feature, N)
df.columns
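The immediate cause of the error is the spelling: the datetime method is strftime, not striftime. A minimal fix for the first marked line (the surrounding response/records names would still need to be defined consistently) would be:
request = url.format(target_date.strftime('%Y%m%d'))   # strftime, not striftime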

Error Unorderable Types. Select rows where on hdf5 files

I work with Python 3.5 and I have the following problem importing some data from an HDF5 file.
I will show a very simple example that summarizes what happens. I created a small dataframe and inserted it into an HDF5 file. Then I tried to select from this HDF5 file the rows whose column "A" has a value less than 1. I get the error:
"TypeError: unorderable types: str() < int()"
import pandas as pd
import numpy as np
import datetime
import time
import h5py
from pandas import DataFrame, HDFStore
def test_conected():
    hdf_nombre_archivo = "1_Archivo.h5"
    hdf = HDFStore(hdf_nombre_archivo)
    np.random.seed(1234)
    index = pd.date_range('1/1/2000', periods=3)
    df = pd.DataFrame(np.random.randn(3, 4), index=index, columns=['A', 'B', 'C', 'F'])
    print(df)
    with h5py.File(hdf_nombre_archivo) as f:
        df.to_hdf(hdf_nombre_archivo, 'df', format='table')
    print("")
    with h5py.File(hdf_nombre_archivo) as f:
        df_nuevo = pd.read_hdf(hdf_nombre_archivo, 'df', where=['A' < 1])
        print(df_nuevo)
def Fin():
    print(" ")
    print("FIN")
if __name__ == "__main__":
    test_conected()
    Fin()
    print(time.strftime("%H:%M:%S"))
I have been investigating but I have not been able to solve this error. Any ideas?
Thanks,
Angel
where= ['A' < 1]
In your condition statement, 'A' is treated as a string (or char) while 1 is an int, so Python cannot order them; first make them the same type by typecasting, e.g.:
str(1)
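Another angle, sketched on the assumption that the file is written in table format: the condition can be handed to pandas as one string, so Python itself never tries to evaluate 'A' < 1, and the column being filtered has to be declared as a data column when writing:
import numpy as np
import pandas as pd
np.random.seed(1234)
df = pd.DataFrame(np.random.randn(3, 4),
                  index=pd.date_range('1/1/2000', periods=3),
                  columns=['A', 'B', 'C', 'F'])
# data_columns=['A'] lets PyTables filter on column A with a where clause
df.to_hdf('1_Archivo.h5', key='df', format='table', data_columns=['A'])
df_nuevo = pd.read_hdf('1_Archivo.h5', key='df', where='A < 1')
print(df_nuevo)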
