Problem statement:
I have two columns in Streamlit: one for the ticker_symbol and the other for its current value.
I want to update the current value (column 2) every second, but the code I have so far first removes the displayed current_price and only then writes the new value. I would like the value to be overwritten in place without ever being removed. As it stands, the last ticker_symbol in the column also has to wait a long time before its current value is shown, because the previous values get removed by st.empty.
What can I do to achieve the goal mentioned above?
Should I not use st.empty ? Are there any other alternatives in streamlit ?
import time
import yfinance as yf
import streamlit as st
# Wide layout so that the twelve narrow columns fit next to each other.
st.set_page_config(page_title="Test", layout='wide')
stock_list = ['NVDA', 'AAPL', 'MSFT']
# Only `left` and `right` are used below; the blank_col* names are never read.
left, right, blank_col1, blank_col2, blank_col3, blank_col4, blank_col5, blank_col6, blank_col7, blank_col8, blank_col9, \
    blank_col10 = st.columns(12, gap='small')
# Left column: the ticker symbols, written once.
with left:
    for index, val in enumerate(stock_list):
        st.write(val)
# Right column: the prices, refreshed in an endless loop.
# NOTE(review): the paste lost its indentation; the nesting below is the most
# plausible reading -- confirm against the original post.
with right:
    while True:
        # A brand-new placeholder is created on every pass of the loop ...
        numbers = st.empty()
        with numbers.container():
            for index, val in enumerate(stock_list):
                stock = yf.Ticker(val)
                price = stock.info['regularMarketPrice']
                # st.write(": ", price)
                st.write(": ", price)
            time.sleep(0.5)
        # ... and explicitly cleared again here, before the next pass writes
        # fresh values into yet another placeholder.
        numbers.empty()
Create the placeholder once, before the loop, and rewrite its contents on every iteration, like this:
with right:
    # Create the placeholder a single time. Every pass of the loop below
    # rewrites its contents in place, so nothing is cleared between refreshes.
    numbers = st.empty()
    while True:
        with numbers.container():
            for val in stock_list:
                price = yf.Ticker(val).info['regularMarketPrice']
                st.write(": ", price)
            time.sleep(0.5)
Sample simulation code with random.
import time
import streamlit as st
import random
st.set_page_config(page_title="Test", layout='wide')

stock_list = ['NVDA', 'AAPL', 'MSFT']

# Twelve narrow columns; only the first two are used, the remaining ten are
# collected into a throw-away list instead of ten individually named variables.
left, right, *_spacers = st.columns(12, gap='small')

# Ticker symbols are static, so they are written once.
with left:
    for symbol in stock_list:
        st.write(symbol)

with right:
    # One placeholder for the whole price column, created once ...
    numbers = st.empty()
    while True:
        # ... whose content is replaced in place on every refresh.
        with numbers.container():
            for _ in stock_list:
                # Simulated price; swap in the real quote lookup here.
                price = random.randint(-100, 100)
                st.write(": ", price)
            time.sleep(0.5)
Related
So, I'm trying to generate some fake random data of a given dimension size. Essentially, I want a dataframe in which the data has a uniform random distribution. The data consist of both continuous and categorical values. I've written the following code, but it doesn't work the way I want it to be.
import random
import pandas as pd
import time
from datetime import datetime
# Module-level lookup tables and settings for the fake-data generator.
# Of these, only adv_loc, adv_prod and num are referenced by the code shown
# in this snippet.
adv_name = ['soft toys', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_loc = ['location_1', 'location_2', 'location_3',
           'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_size = [1, 2, 3, 4, 10]
adv_layout = ['static', 'dynamic']  # advertisement layout type on website
# adv_date, start_time, end_time = []
num = 10  # the given dimension
# define function to generate random advert locations
def rand_shuf_loc(str_lst, num):
    """Return every item of ``str_lst`` repeated ``num`` times, in input order.

    The result has ``len(str_lst) * num`` entries. Despite the name, nothing
    is shuffled: equal items stay grouped together.

    The list passed as ``str_lst`` is now the one that is used; previously the
    argument was ignored and the module-level ``adv_loc`` was read instead.
    """
    return [item for item in str_lst for _ in range(num)]
# define function to generate random advert names
def rand_shuf_prod(loc_list, num):
    """Return the items of ``loc_list``, each repeated ``num`` times, shuffled.

    The result has ``len(loc_list) * num`` entries in random order.
    """
    repeated = []
    for entry in loc_list:
        repeated.extend([entry] * num)
    random.shuffle(repeated)
    return repeated
# define function to generate random impression and click data
def rand_clic_impr(num):
    """Return ``num`` random impression counts and ``num`` random click counts.

    Both lists hold integers from 0 to 100 inclusive and are returned under
    the keys ``'rand_impr_lst'`` and ``'rand_click_lst'``.
    """
    # Draw each (impression, click) pair together so the random stream is
    # consumed in the order impression, click, impression, click, ...
    pairs = [(random.randint(0, 100), random.randint(0, 100))
             for _ in range(num)]
    return {'rand_impr_lst': [impr for impr, _ in pairs],
            'rand_click_lst': [click for _, click in pairs]}
# define function to generate random product price and discount
def rand_prod_price_discount(num):
    """Return ``num`` random product prices and ``num`` random discounts.

    Both lists hold integers from 10 to 100 inclusive and are returned under
    the keys ``'prod_price_lst'`` and ``'prod_discnt_lst'``.
    """
    # One (price, discount) pair per row keeps the draw order
    # price, discount, price, discount, ...
    pairs = [(random.randint(10, 100), random.randint(10, 100))
             for _ in range(num)]
    return {'prod_price_lst': [price for price, _ in pairs],
            'prod_discnt_lst': [discount for _, discount in pairs]}
def rand_prod_click_timestamp(stime, etime, num):
    """Return random ``HH:MM:SS`` strings under ``'prod_clik_tmstmp_lst'``.

    ``num`` random times of day are drawn and each one is then repeated
    ``num`` times, so the returned list has ``num * num`` entries.
    ``stime`` and ``etime`` are accepted but not used.
    """
    drawn = []
    for _ in range(num):
        second_of_day = int(random.random()*86400)
        hours, remainder = divmod(second_of_day, 3600)
        minutes, seconds = divmod(remainder, 60)
        drawn.append('%02d:%02d:%02d' % (hours, minutes, seconds))
    repeated = []
    for stamp in drawn:
        repeated.extend([stamp] * num)
    return {'prod_clik_tmstmp_lst': repeated}
def main():
    """Build the fake advert table, print it transposed and write it to ``fake_data.csv``."""
    print('generating data...')
    # print('generating random geographic coordinates...')
    # get the impressions and click data
    # Each helper is called twice; only one of the two lists it returns is
    # used from each call.
    impression = rand_clic_impr(num)
    clicks = rand_clic_impr(num)
    product_price = rand_prod_price_discount(num)
    product_discount = rand_prod_price_discount(num)
    prod_clik_tmstmp = rand_prod_click_timestamp("20-01-2018 13:30:00",
                                                 "23-01-2018 04:50:34",num)
    # The lists differ in length: with num = 10 and five locations/products,
    # "ad_loc" and "prod" have 50 entries, the four numeric lists have 10 and
    # "prod_clik_stmp" has 100 (num * num).
    lst_dict = {"ad_loc": rand_shuf_loc(adv_loc, num),
                "prod": rand_shuf_prod(adv_prod, num),
                "imprsn": impression['rand_impr_lst'],
                "cliks": clicks['rand_click_lst'],
                "prod_price": product_price['prod_price_lst'],
                "prod_discnt": product_discount['prod_discnt_lst'],
                "prod_clik_stmp": prod_clik_tmstmp['prod_clik_tmstmp_lst']}
    # orient="index" turns every dict key into a row; shorter lists are padded
    # with NaN.
    fake_data = pd.DataFrame.from_dict(lst_dict, orient="index")
    # apply() is called without an axis argument. The random fill value is
    # drawn once per call of the lambda, not once per missing cell.
    res = fake_data.apply(lambda x: x.fillna(0)
                          if x.dtype.kind in 'biufc'
                          # NumPy kind codes: b = boolean, i = signed integer,
                          # u = unsigned integer, f = float, c = complex
                          else x.fillna(random.randint(0, 100)
                                        )
                          )
    # Only the printed copy is transposed; `res` itself is written to the CSV
    # in its original (one row per key) orientation.
    print(res.transpose())
    res.to_csv("fake_data.csv", sep=",")
# invoke the main function
if __name__ == "__main__":
    main()
Problem 1
When I execute the above code snippet, it prints fine, but when written to CSV format it is positioned horizontally; i.e., it looks like this... How do I position it vertically when writing to the CSV file? What I want is 7 columns (see the lst_dict variable above) with n rows.
Problem 2
I don't understand why the random date is generated only for the first 50 columns, while the remaining columns are filled with numerical values.
To answer your first question, replace
print(res.transpose())
with
# transpose() returns a new frame, so keep the result; that way the printed
# frame and the one later written with to_csv() have the same orientation.
res = res.transpose()
print(res)
To answer your second question look at the length of the output of the method
rand_shuf_loc()
it as well as the other helper functions only produce a list of 50 items.
The creation of res using the method
fake_data.apply
replaces every NaN with a random number, so it also puts a number into the cells of the columns that have no predefined value.
I have a database table called 'do_not_call', which contains information about files that hold a range of 10 digit phone numbers in the increasing order. The column 'filename' holds the name of file that contain the range of numbers from 'first_phone' to 'last_phone'. There are about 2500 records in 'do_not_call' table.
And I have a list of sqlalchemy records. I need to find which file is holding the 'phone' field of these records. So I have created a function which takes in the sqlalchemy records and returns a dictionary where the key is the name of file and value is a list of phone numbers from the sqlalchemy records that falls in the range of first and last phone numbers, contained in the file.
def get_file_mappings(dbcursor, import_records):
    """Map each do-not-call file name to the imported phones in its range.

    Every row of ``do_not_call`` is visited in the order the cursor returns
    it. A phone belongs to a row when ``first_phone <= phone < last_phone``
    (the upper bound is exclusive, exactly as with ``range``). Phones that
    were matched are removed from the pool before the next row is checked.
    Files without any match are left out of the result. The elapsed time is
    printed before returning.

    NOTE(review): the paste lost its indentation; the pool is assumed to be
    reduced only when a row had matches -- confirm against the original.
    """
    started = datetime.now()
    remaining = [int(rec.phone) for rec in import_records]
    dbcursor.execute("SELECT * from do_not_call;")
    file_mappings = {}
    for file_info in dbcursor.fetchall():
        first_phone = int(file_info.get('first_phone'))
        last_phone = int(file_info.get('last_phone'))
        matched = [phone for phone in remaining
                   if first_phone <= phone < last_phone]
        if not matched:
            continue
        file_mappings[file_info.get('filename')] = matched
        remaining = list(set(remaining) - set(matched))
    # print(file_mappings)
    print("Time = ", datetime.now() - started)
    return file_mappings
For example if the phone_list is
[2023143300, 2024393100, 2027981539, 2022760321, 2026416368, 2027585911], the file_mappings returned will be
{'1500000_2020-9-24_Global_45A62481-17A2-4E45-82D6-DDF8B58B1BF8.txt': [2023143300, 2022760321],
'1700000_2020-9-24_Global_45A62481-17A2-4E45-82D6-DDF8B58B1BF8.txt': [2024393100],
'1900000_2020-9-24_Global_45A62481-17A2-4E45-82D6-DDF8B58B1BF8.txt': [2027981539, 2026416368, 2027585911]}
The problem here is that it takes a lot of time to execute. On average it takes about 1.5 seconds for 1000 records. Is there a better approach/algorithm to solve this problem. Any help is appreciated.
This is a very inefficient approach to binning things into a sorted list. You are not taking advantage of the fact that your bins are sorted (or could easily be sorted if they were not.) You are making a big nested loop here by testing phone numbers with the lambda statement.
You could make some marginal improvements by being consistent with set use (see below.) But in the end, you could/should just find each phone's place in the listing with an efficient search, like bisection. See example below with timing of original, set implementation, and bisection insertion.
If your phone_list is just massive, then other approaches may be advantageous, such as finding where the cutoff bins fit into a sorted copy of the phone list... but this below is 500x faster than what you have now for 1,000 or 10,000 records
# phone sorter
import random
import bisect
import time
from collections import defaultdict
def make_bins(num_bins=2500, low_phone=200_000_0000, width=20000000, gap=20):
    """Return ``[file, low_phone, high_phone]`` rows of representative size.

    Bins are sorted and contiguous except for a small ``gap`` of unassigned
    numbers at the top of each one. Both bounds are inclusive.
    """
    return [[f'file_{idx}',
             low_phone + idx * width,
             low_phone + (idx + 1) * width - gap]  # some gap
            for idx in range(num_bins)]


def bin_by_scan(data, phones):
    """Original approach: test every remaining phone against every bin."""
    remaining = list(phones)
    results = {}
    for name, low, high in data:
        in_bin = list(filter(lambda phone: phone in range(low, high + 1), remaining))
        if in_bin:
            results[name] = in_bin
            remaining = list(set(remaining) - set(in_bin))
    return results


def bin_by_scan_sets(data, phones):
    """Same scan as ``bin_by_scan`` but using sets across the board."""
    remaining = set(phones)
    results = {}
    for name, low, high in data:
        in_bin = set(filter(lambda phone: phone in range(low, high + 1), remaining))
        if in_bin:
            results[name] = in_bin
            remaining = remaining - in_bin
    return results


def bin_by_bisect(data, phones):
    """Locate each phone's bin with a binary search over the sorted lower bounds."""
    results = defaultdict(list)
    lows = [row[1] for row in data]
    for phone in phones:
        location = bisect.bisect(lows, phone) - 1
        # location == -1 means the phone lies below the first bin; without
        # this check data[-1] would silently pick the *last* bin instead.
        if location >= 0 and phone <= data[location][2]:  # within the high limit of bin
            results[data[location][0]].append(phone)
    return dict(results)


def _as_sets(results):
    """Normalise a ``{file: phones}`` mapping so the three methods can be compared."""
    return {name: set(found) for name, found in results.items()}


def main(num_bins=2500, num_phones=10000):
    """Time the three binning methods on random data and check that they agree.

    Requires ``num_bins >= 1``. Returns the bisection result as
    ``{file: set_of_phones}``.
    """
    data = make_bins(num_bins)
    low_phone, high_phone = data[0][1], data[-1][2]
    # generate some random phone numbers in range
    phone_list_orig = [random.randint(low_phone, high_phone) for _ in range(num_phones)]

    # orig method...
    tic = time.time()
    results = bin_by_scan(data, phone_list_orig[:])
    toc = time.time()
    print(f'orig time: {toc-tic:.3f}')

    # with sets across the board...
    tic = time.time()
    results2 = bin_by_scan_sets(data, phone_list_orig)
    toc = time.time()
    print(f'using sets time: {toc-tic:.3f}')

    # using bisection search
    tic = time.time()
    results3 = bin_by_bisect(data, set(phone_list_orig))
    toc = time.time()
    print(f'using bisection sort time: {toc-tic:.3f}')

    # The methods return lists, sets and lists respectively, so compare them
    # as sets; comparing the raw dicts can never succeed.
    if not (_as_sets(results) == _as_sets(results2) == _as_sets(results3)):
        raise AssertionError('binning methods disagree')
    return _as_sets(results3)


if __name__ == '__main__':
    main()
results:
orig time: 5.236
using sets time: 4.597
using bisection sort time: 0.012
[Finished in 9.9s]
I am feeding a long list of inputs into a function that calls an API to retrieve data. My list contains around 40,000 unique inputs. Currently, the function returns output every 1-2 seconds or so, so a quick calculation tells me it would take more than 10 hours before my function is done. I therefore want to speed this process up, but I am struggling to find a solution. I am quite a beginner, so threading/pooling is quite difficult for me. I hope someone is able to help me out here.
The function:
import quandl
import datetime
import numpy as np
quandl.ApiConfig.api_key = 'API key here'
def get_data(issue_date, stock_ticker):
    """Return ``[volatility, average price, illiquidity]`` for one issue.

    Parameters:
        issue_date: issue date as a ``"%d/%m/%Y"`` string.
        stock_ticker: ticker symbol; ``"EOD/"`` is prepended to form the
            quandl code.

    Returns:
        A three-item list, or ``[]`` when quandl does not know the code.

    The three measures use windows of 240-40, 12-2 and 120-20 days before the
    issue date. They all lie inside the 240-2 day span, so the prices are
    downloaded once and sliced locally instead of being requested three
    times, which cuts the number of network round trips per ticker to a
    third.

    NOTE(review): assumes quandl returns a frame indexed by date -- confirm.
    """
    # Prepare var
    stock_ticker = "EOD/" + stock_ticker
    date_1 = datetime.datetime.strptime(issue_date, "%d/%m/%Y")

    def days_before(days):
        # Calendar days counted back from the issue date.
        return date_1 - datetime.timedelta(days=days)

    # Check if code exists : if not -> return empty array
    try:
        history = quandl.get(stock_ticker,
                             start_date=days_before(240),
                             end_date=days_before(2))
    except quandl.errors.quandl_error.NotFoundError:
        return []
    # Label slicing with bounds that are not in the index needs a sorted index.
    history = history.sort_index()

    # Volatility (-240 to -40 days)
    stock = history.loc[days_before(240):days_before(40)]
    daily_close = stock['Adj_Close'].pct_change()  # returns using adj.close
    stock_vola = np.std(daily_close) * np.sqrt(252)  # annualized volatility

    # Average price (-12 to -2 days)
    stock_price = history.loc[days_before(12):days_before(2)]
    stock_price_average = np.mean(stock_price['Adj_Close'])

    # Amihuds Liquidity measure (-120 to -20 days)
    stock_data = history.loc[days_before(120):days_before(20)]
    p = np.array(stock_data['Adj_Close'])
    returns = np.array(stock_data['Adj_Close'].pct_change())
    dollar_volume = np.array(stock_data['Adj_Volume'] * p)
    illiq = np.divide(returns, dollar_volume)
    # multiply by 10^6 for expositional purposes
    illiquidity_measure = np.nanmean(illiq, dtype=float) * (10 ** 6)
    return [stock_vola, stock_price_average, illiquidity_measure]
I then use a separate script to select my CSV file; each row of that file contains the issue_date and the stock_ticker.
import function
import csv
import tkinter as tk
from tkinter import filedialog
# Open File Dialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()

# Load Spreadsheet data
result_data = []
# The with-block closes the input file; newline='' is what the csv module
# expects for files it reads.
with open(file_path, newline='') as f:
    csv_f = csv.reader(f)
    next(csv_f, None)  # skip the header row (tolerates an empty file)
    # Iterate
    for row in csv_f:
        try:
            return_data = function.get_data(row[1], row[0])
        except AttributeError:
            print(row[0])
            print('\n\n')
            print(row[1])
            continue
        if return_data:
            result_data.append([row[1], row[0], *return_data])

# result_data is always a list, so test for emptiness rather than for None;
# otherwise the "No results found!" branch can never run.
if result_data:
    with open('resuls.csv', mode='w', newline='') as result_file:
        csv_writer = csv.writer(result_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(result_data)
else:
    print("No results found!")
It is quite messy, but like I mentioned before, I am definitely a beginner. Speeding this up would greatly help me.
I have used following codes.
from collections import defaultdict
from random import randint, randrange,choice, shuffle
def random_array(low, high, step, size):
    """Return ``size`` distinct random values from ``range(low, high, step)``.

    Values are drawn one at a time and a draw that is already in the result
    is thrown away, so the loop only ends once ``size`` distinct values have
    been found. It never ends when the range holds fewer than ``size`` values.
    """
    picked = []
    while len(picked) < size:
        candidate = randrange(low, high, step)
        if candidate not in picked:
            picked.append(candidate)
    return picked
def find_pair_from_two_list(a, b, val):
    """Return ``(x, y)`` with ``x`` in ``a``, ``y`` in ``b`` and ``x + y == val``.

    The first matching ``x`` in the order of ``a`` wins. Returns ``(-1, -1)``
    when no such pair exists.
    """
    # Only membership in b matters, so a set replaces the index dictionary
    # whose stored positions were never read.
    b_values = set(b)
    for v in a:
        if (val - v) in b_values:
            return v, val - v
    return -1, -1
# range(1, 100, 1) holds exactly 99 values, so each call returns all of them
# in random order.
arr1 = random_array(1, 100, 1, 99)
arr2 = random_array(1, 100, 1, 99)
# Build the target from one element of each list, which guarantees that at
# least one valid pair exists.
val1 = choice(arr1)
val2 = choice(arr2)
val = val1 + val2
print(find_pair_from_two_list(arr1,arr2, val))
However if i change size value in
arr1 = random_array(1, 100, 1, 99)
arr2 = random_array(1, 100, 1, 99)
upto 99 it works instantly but if i change any of the size value to 100 or more it just seems to hang in there.
I am curious to know why this is happening.I mean it works well till 99 but what causes it to hang for even 100.
Why is yours slow:
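With arr1 = random_array(1, 100, 1, 100) your method can take a very long time to draw the last missing numbers, because you draw new random values over and over and discard them whenever they are already in your result list. In this particular case it never finishes at all, because range(1, 100) contains only 99 distinct values: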
while len(lst)<size:
nexts = randrange(low, high, step)
if nexts in lst:continue # discards already inside numbers
lst.append(nexts)
return lst
With inputs like this you essentially draw "all" possible numbers until done and the more your result contains the longer it takes to draw another "fitting" one.
You can even produce endless loops if your range(low, high, step) contains fewer values in total than your size demands.
(1,100,5,100) # => only 20 in this range with this stepper -> endless loop
Possible simplification (not optimal)
You could simplify and speed up the code like this:
import random
def random_array(low, high, step, size):
    """Return up to ``size`` distinct random values from ``range(low, high, step)``.

    The whole range is materialised and shuffled, then cut to ``size``. When
    the range holds fewer than ``size`` values, all of them are returned.
    """
    candidates = [*range(low, high, step)]  # a range never contains duplicates
    random.shuffle(candidates)
    return candidates[:size]
print(random_array(1,100,1,10))
This code also returns when you give it a "wrong" combination (a size larger than the number of values in the range), but the resulting list is then shorter than the size you asked for.
Even better
jonsharpes suggestion to use
random.sample(range(low,high,step),size)
like so:
def ra(low,high,step,size):
    """Return ``size`` distinct random values from ``range(low, high, step)``.

    Delegates to ``random.sample``, which samples straight from the range
    object.
    """
    population = range(low, high, step)
    return random.sample(population, size)
Performance test
Performance-wise, random.sample easily outperforms my version for big lists:
import random
def random_array(low, high, step, size):
    """Shuffle-based variant: materialise the range, shuffle it, cut to ``size``."""
    shuffled = [*range(low, high, step)]
    random.shuffle(shuffled)
    return shuffled[:size]


def ra(low,high,step,size):
    """``random.sample``-based variant working directly on the range object."""
    population = range(low, high, step)
    return random.sample(population, size)


import timeit

if __name__ == '__main__':
    # Time 10000 calls of each variant; every call draws 495 values from
    # range(1, 1000000, 22).
    benchmarks = (
        ("ra(1,1000000,22,495)", "from __main__ import ra"),
        ("random_array(1,1000000,22,495)", "from __main__ import random_array"),
    )
    for statement, setup in benchmarks:
        print(timeit.timeit(statement, setup=setup, number = 10000))
Output:
1.1825043768664596 # random.sample(...) of range(...)
92.12594874871951 # mine
Reason probably being I create actual lists from ranges, random.sample uses ranges with iterators smartly...
Doku:
https://docs.python.org/3.1/library/random.html
https://docs.python.org/3/library/timeit.html
I'm trying to use a slider with a callback in Bokeh using Python 3 to filter the rows of my ColumnDataSource objects (which originate from a DataFrame). More specifically, if a slider with options of 0 to 10000000 (in multiples of 1 million) returns a value N of say 2000000, then I want my plot to only show the data for, in this case, US counties where the population is >= 2000000. Below is my code. Everything works as I want it to except for the slider callback.
from bokeh.io import curdoc
from bokeh.layouts import layout
from bokeh.models import HoverTool, ColumnDataSource, Select, Slider
from bokeh.plotting import figure
TOOLS='pan,wheel_zoom,box_zoom,reset,tap,save,box_select,lasso_select'
# `df` is not defined in this snippet; it has to exist before this point.
# One data source per winning party, so each party gets its own glyph.
source1 = ColumnDataSource(df[df.winner == 'Democratic'])
source2 = ColumnDataSource(df[df.winner == 'Republican'])
# NOTE(review): Bokeh tooltips refer to data columns as '@column'; the '#'
# prefixes below look like an artefact of the paste -- confirm.
hover = HoverTool(
    tooltips = [
        ('County Name', '#county'),
        ('Population', '#population'),
        ('Land Area', '#land_area'),
        ('Pop. Density', '#density'),
        ('Winning Party', '#winner'),
        ('Winning Vote %', '#winning_vote_pct'),
    ]
)
# Plot
plot = figure(plot_width=800, plot_height=450, tools=[hover, TOOLS],
              title='2016 US Presidential Vote % vs. Population Density (by County)',
              x_axis_label='Vote %', y_axis_label='Population Density (K / sq. mi.)')
y = 'density'
size = 'bokeh_size'
alpha = 0.5
# c1 draws from source1 and c2 from source2; the glyphs keep referring to
# these two source objects.
c1 = plot.circle(x='pct_d', y=y, size=size, alpha=alpha, color='blue',
                 legend='Democratic-Won County', source=source1)
c2 = plot.circle(x='pct_r', y=y, size=size, alpha=alpha, color='red',
                 legend='Republican-Won County', source=source2)
plot.legend.location = 'top_left'
# Select widget
party_options = ['Show both parties', 'Democratic-won only', 'Republican-won only']
menu = Select(options=party_options, value='Show both parties')
# Slider widget
N = 2000000
slider = Slider(start=0, end=10000000, step=1000000, value=N, title='Population Cutoff')
# Select callback
def select_callback(attr, old, new):
    # Toggle glyph visibility according to the chosen menu entry.
    if menu.value == 'Democratic-won only': c1.visible=True; c2.visible=False
    elif menu.value == 'Republican-won only': c1.visible=False; c2.visible=True
    elif menu.value == 'Show both parties': c1.visible=True; c2.visible=True
menu.on_change('value', select_callback)
# Slider callback
def slider_callback(attr, old, new):
    N = slider.value
    # NEED HELP HERE...
    # These assignments bind *local* names source1/source2. The module-level
    # sources that c1 and c2 were created with are not touched.
    source1 = ColumnDataSource(df.loc[(df.winner == 'Democratic') & (df.population >= N)])
    source2 = ColumnDataSource(df.loc[(df.winner == 'Republican') & (df.population >= N)])
slider.on_change('value', slider_callback)
# Arrange plots and widgets in layouts
# Note: this rebinds the name `layout`, replacing the imported function.
layout = layout([menu, slider],
                [plot])
curdoc().add_root(layout)
Here is a solution using CustomJSFilter and CDSView, as suggested in the other answer by Alex. It does not directly use the data supplied in the question; it is rather a general hint at how this can be implemented:
from bokeh.layouts import column
from bokeh.models import CustomJS, ColumnDataSource, Slider, CustomJSFilter, CDSView
from bokeh.plotting import Figure, show
import numpy as np
# Create some data to display
x = np.arange(200)
y = np.random.random(size=200)
source = ColumnDataSource(data=dict(x=x, y=y))
plot = Figure(plot_width=400, plot_height=400)

# Create the slider that modifies the filtered indices
# I am just creating one that shows 0 to 100% of the existing data rows
slider = Slider(start=0., end=1., value=1., step=.01, title="Percentage")

# This callback is crucial, otherwise the filter will not be triggered when the slider changes
callback = CustomJS(args=dict(source=source), code="""
    source.change.emit();
""")
slider.js_on_change('value', callback)

# Define the custom filter to return the indices from 0 to the desired
# percentage of total data rows. You could also compare against values in
# source.data.
# The row count is read from the source instead of being hard-coded, and the
# product is rounded: slider.value is a float (e.g. 0.57 * 200 gives
# 113.99999999999999) and Array() throws a RangeError for a non-integer length.
js_filter = CustomJSFilter(args=dict(slider=slider, source=source), code="""
const total = source.data['x'].length;
const desiredElementCount = Math.round(slider.value * total);
return [...Array(desiredElementCount).keys()];
""")

# Use the filter in a view
view = CDSView(source=source, filters=[js_filter])
plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6, view=view)
layout = column(slider, plot)
show(layout)
I hope this helps anyone who stumbles upon this in the future! Tested in bokeh 1.0.2
A quick solution with minimal change to your code would be:
def slider_callback(attr, old, new):
    """Show only the counties whose population is at least the slider value.

    ``new`` is the slider's new value (``slider.value`` holds the same
    number). The existing data sources are kept and only their ``data`` is
    replaced, so the glyphs that were created with them pick up the change.
    """
    big_enough = df.population >= new
    # from_df() returns a plain dict of columns. Assigning another source's
    # `.data` wrapper object directly is rejected by newer Bokeh releases.
    source1.data = ColumnDataSource.from_df(df.loc[(df.winner == 'Democratic') & big_enough])
    source2.data = ColumnDataSource.from_df(df.loc[(df.winner == 'Republican') & big_enough])
When updating data sources, you should replace the data, not the whole object. Here I still create new ColumnDataSource as shortcut. A more direct way (but more verbose too) would be to create the dictionary from the filtered df's columns:
new1 = {
'winner': filtered_df.winner.values,
'pct_d': filtered_df.pct_d.values,
...
}
new2 = {...}
source1.data = new1
source2.data = new2
Note that there is another solution, which makes the callback local (not server-based) by using a CDSView with a CustomJSFilter. You can also write the other callback with a CDSView, which makes the plot completely server-independent.