PySpark GraphFrame and networkx for graphs with hierarchy - apache-spark

I need to create a graph like this which have two relationships, continent-country, country-city. I have 3 columns: city, country, continent, but not sure how to get it into this graph.
Below is an example of another graph with only two columns, country & city. metro_name is the city.
metro = spark.read.csv("metro.csv", header='true').withColumnRenamed("name","metro_name")
country = spark.read.csv("country.csv", header='true').withColumnRenamed("name","country_name")
continent = spark.read.csv("continent.csv", header='true').withColumnRenamed("name","continent_name")
metro_country = spark.read.csv("metro_country.csv", header='true')
country_continent = spark.read.csv("country_continent.csv", header='true')
mc_vertices = country.select(col("country_name").alias("id"))
mc_edges = country.join(metro_country, country.country_id == metro_country.country_id).join(metro, metro_country.metro_id == metro.metro_id).select(col("country_name").alias("src"),col("metro_name").alias("dst"))
mc = GraphFrame(mc_vertices, mc_edges)
# display graph
import networkx as nx
mc_gp = nx.from_pandas_edgelist(mc.edges.toPandas(),'src','dst')
nx.draw(mc_gp, with_labels = True, node_size = 12, font_size = 12, edge_color = "red")
I have tried:
# gets graph that has country, city, continent
mcc_vertices = mc_vertices
mcc_edges = mc_edges.join(cc_edges, mc_edges.src == cc_edges.src).select(mc_edges["src"],mc_edges["dst"],cc_edges["dst"].alias("continent_name"))
mcc = GraphFrame(mcc_vertices, mcc_edges)
# display the graph
mcc_gp = nx.from_pandas_edgelist(mcc.edges.toPandas(),'continent_name','src','dst')
nx.draw(mcc_gp, with_labels = True, node_size = 12, font_size = 12, edge_color = "red")
# gets graph that only has "North America"
northamerica_vertices = mcc_edges.filter(mcc_edges.continent_name == "North America").select(col("src").alias("id")).distinct()
northamerica_edges = mcc_edges.filter(mcc_edges.continent_name == "North America")
northamerica = GraphFrame(northamerica_vertices, northamerica_edges)
northamerica_gp = nx.from_pandas_edgelist(northamerica.edges.toPandas(),'src','dst')
nx.draw(northamerica_gp, with_labels = True, node_size = 40, font_size = 10, edge_color = "red")

Related

I am trying to create a population pyramid graph using Dash with Plotly

i have a directory containing three files, years.csv, 2014.csv and 2015.csv. i want to plot a population pyramid graph for the two files but i want pandas to pick the dataframe from the years.csv with respect to the slider value.
my years.csv looks like, on the slider when i select 2014, from the code you can see, its an int that i convert into a string and append .csv to it. but all i want is that final string interpreted as df = pd.read_csv('2014.csv') so that i can be able to generate graphs of all the years as long as that file is in the directoy.
years
0
2014(2014.csv)
1
2015(2015.csv)
from dash import Dash, dcc, html, Input, Output
# import plotly.express as px
import plotly.graph_objects as gp
import pandas as pd
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv')
df = pd.read_csv('years.csv')
app = Dash(__name__)
app.layout = html.Div([
dcc.Graph(id='graph-with-slider'),
dcc.Slider(
df['year'].min(),
df['year'].max(),
step=None,
value=df['year'].min(),
marks={str(year): str(year) for year in df['year'].unique()},
id='year-slider'
)
])
#app.callback(
Output('graph-with-slider', 'figure'),
Input('year-slider', 'value'))
def update_figure(selected_year):
new_df = str(df[df.year == selected_year]) + ".csv"
print(new_df)
# fig = px.scatter(filtered_df, x="gdpPercap", y="lifeExp",
# size="pop", color="continent", hover_name="country",
# log_x=True, size_max=55)
y_age = new_df['Age']
x_M = new_df['Male']
x_F = new_df['Female'] * -1
# fig.update_layout(transition_duration=500)
# Creating instance of the figure
fig = gp.Figure()
# Adding Male data to the figure
fig.add_trace(gp.Bar(y= y_age, x = x_M,
name = 'Male',
orientation = 'h'))
# Adding Female data to the figure
fig.add_trace(gp.Bar(y = y_age, x = x_F,
name = 'Female', orientation = 'h'))
# Updating the layoutout for our graph
fig.update_layout(title = 'Population Pyramid of Uganda-2015',
title_font_size = 22, barmode = 'relative',
bargap = 0.0, bargroupgap = 0,
xaxis = dict(tickvals = [-600000, -400000, -200000,
0, 200000, 400000, 600000],
ticktext = ['6k', '4k', '2k', '0',
'2k', '4k', '6k'],
title = 'Population in Thousands',
title_font_size = 14)
)
# fig.show()
return fig
if __name__ == '__main__':
app.run_server(debug=True)

How to fix 'Dropdown Menu Read' Error in Plotly Dash

I have tried to re-create the following example Towards Data Science Example shown on the web
I have written the following code which I modified to this:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objs as go
# Step 1. Launch the application
app = dash.Dash()
# Step 2. Import the dataset
filepath = 'https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv'
st = pd.read_csv(filepath)
# range slider options
st['Date'] = pd.to_datetime(st.Date)
dates = ['2015-02-17', '2015-05-17', '2015-08-17', '2015-11-17',
'2016-02-17', '2016-05-17', '2016-08-17', '2016-11-17', '2017-02-17']
features = st.columns[1:-1]
opts = [{'label' : i, 'value' : i} for i in features]
# Step 3. Create a plotly figure
trace_1 = go.Scatter(x = st.Date, y = st['AAPL.High'],
name = 'AAPL HIGH',
line = dict(width = 2,
color = 'rgb(229, 151, 50)'))
layout = go.Layout(title = 'Time Series Plot',
hovermode = 'closest')
fig = go.Figure(data = [trace_1], layout = layout)
# Step 4. Create a Dash layout
app.layout = html.Div([
# a header and a paragraph
html.Div([
html.H1("This is my first dashboard"),
html.P("Dash is so interesting!!")
],
style = {'padding' : '50px' ,
'backgroundColor' : '#3aaab2'}),
# adding a plot
dcc.Graph(id = 'plot', figure = fig),
# dropdown
html.P([
html.Label("Choose a feature"),
dcc.Dropdown(
id='opt',
options=opts,
value=features[0],
multi=True
),
# range slider
html.P([
html.Label("Time Period"),
dcc.RangeSlider(id = 'slider',
marks = {i : dates[i] for i in range(0, 9)},
min = 0,
max = 8,
value = [1, 7])
], style = {'width' : '80%',
'fontSize' : '20px',
'padding-left' : '100px',
'display': 'inline-block'})
])
])
# Step 5. Add callback functions
#app.callback(Output('plot', 'figure'),
[Input('opt', 'value'),
Input('slider', 'value')])
def update_figure(input1, input2):
# filtering the data
st2 = st[(st.Date > dates[input2[0]]) & (st.Date < dates[input2[1]])]
# updating the plot
trace_1 = go.Scatter(x = st2.Date, y = st2['AAPL.High'],
name = 'AAPL HIGH',
line = dict(width = 2,
color = 'rgb(229, 151, 50)'))
trace_2 = go.Scatter(x = st2.Date, y = st2[input1],
name = str(input1),
line = dict(width = 2,
color = 'rgb(106, 181, 135)'))
fig = go.Figure(data = [trace_1, trace_2], layout = layout)
return fig
# Step 6. Add the server clause
if __name__ == '__main__':
app.run_server(debug = True)
When I change the feature input, it does not update the plot correctly and does not show the selected features in the plot.
Either there is something wrong with the callback function or the initialization of the graph with the second trace. But I cant figure out where the issue is.
As you are only providing two scatter traces within your callback. From both, one is static for 'AAPL.High'. So you need to limit the dropdown values to Multi=False.
Valid plots are only generated for choosing options like 'AAPL.LOW' and others like dic won't display a second trace. The callback wouldn't terminate if you would keepmulti=True the callback would stil work, if always only one option is selected. The moment you select two or more options the script will fail as it would try to find faulty data for the data return block here:
trace_2 = go.Scatter(x = st2.Date, y = st2[**MULTIINPUT**],
name = str(input1),
line = dict(width = 2,
color = 'rgb(106, 181, 135)'))
Only one column id is allowed to be passed at MULTIINPUT. If you want to introduce more traces please use a for loop.
Change the code to the following:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objs as go
# Step 1. Launch the application
app = dash.Dash()
# Step 2. Import the dataset
filepath = 'https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv'
st = pd.read_csv(filepath)
# range slider options
st['Date'] = pd.to_datetime(st.Date)
dates = ['2015-02-17', '2015-05-17', '2015-08-17', '2015-11-17',
'2016-02-17', '2016-05-17', '2016-08-17', '2016-11-17', '2017-02-17']
features = st.columns
opts = [{'label' : i, 'value' : i} for i in features]
# Step 3. Create a plotly figure
trace_1 = go.Scatter(x = st.Date, y = st['AAPL.High'],
name = 'AAPL HIGH',
line = dict(width = 2,
color = 'rgb(229, 151, 50)'))
layout = go.Layout(title = 'Time Series Plot',
hovermode = 'closest')
fig = go.Figure(data = [trace_1], layout = layout)
# Step 4. Create a Dash layout
app.layout = html.Div([
# a header and a paragraph
html.Div([
html.H1("This is a Test Dashboard"),
html.P("Dash is great!!")
],
style = {'padding' : '50px' ,
'backgroundColor' : '#3aaab2'}),
# adding a plot
dcc.Graph(id = 'plot', figure = fig),
# dropdown
html.P([
html.Label("Choose a feature"),
dcc.Dropdown(
id='opt',
options=opts,
value=features[0],
multi=False
),
# range slider
html.P([
html.Label("Time Period"),
dcc.RangeSlider(id = 'slider',
marks = {i : dates[i] for i in range(0, 9)},
min = 0,
max = 8,
value = [1, 7])
], style = {'width' : '80%',
'fontSize' : '20px',
'padding-left' : '100px',
'display': 'inline-block'})
])
])
# Step 5. Add callback functions
#app.callback(Output('plot', 'figure'),
[Input('opt', 'value'),
Input('slider', 'value')])
def update_figure(input1, input2):
# filtering the data
st2 = st#[(st.Date > dates[input2[0]]) & (st.Date < dates[input2[1]])]
# updating the plot
trace_1 = go.Scatter(x = st2.Date, y = st2['AAPL.High'],
name = 'AAPL HIGH',
line = dict(width = 2,
color = 'rgb(229, 151, 50)'))
trace_2 = go.Scatter(x = st2.Date, y = st2[input1],
name = str(input1),
line = dict(width = 2,
color = 'rgb(106, 181, 135)'))
fig = go.Figure(data = [trace_1, trace_2], layout = layout)
return fig
# Step 6. Add the server clause
if __name__ == '__main__':
app.run_server(debug = True)
I hope this cleared things up and solved your issues. :)

How do I add vertical moving hover line to my plotly chart

I am trying to achieve what is done here: https://www.quantalys.com/Fonds/120955 with javascript in python plotly. I want to add the hover vertical line and the red annotation on the x axis. I have done some searching on goolgle but I couldn't find the the answer I'm looking for. My current chart looks like this:
trace1 = go.Scatter(
x = df1.x,
y = df1.y,
name = "M&G OPTIMAL INCOME FD EUR AH ACC",
hoverinfo= 'name',
opacity=0.7,
mode = 'lines',
line = dict(
color = ('rgb(2, 12, 245)'),
width = 1,
),
)
trace2 = go.Scatter(
x = df2.x,
y = df2.y,
opacity=0.7,
name = "Alloc Flexible Prudent Monde",
hoverinfo= 'name',
mode = 'lines',
line = dict(
color = ('rgb(67, 45, 24)'),
width = 1,
)
)
trace3 = go.Scatter(
x = df3.x,
y = df3.y,
name = "25% MSCI World + 75% ML Global",
hoverinfo= 'name',
mode = 'lines',
opacity=0.7,
line = dict(
color = ('rgb(205, 12, 24)'),
width = 1,
)
)
layout = go.Layout(
xaxis=dict(
showline=True,
showgrid=True,
showticklabels=True,
linecolor='rgb(204, 204, 204)',
linewidth=2,
mirror=True,
),
yaxis=dict(
showline=True,
showgrid=True,
showticklabels=True,
linecolor='rgb(204, 204, 204)',
linewidth=2,
mirror=True,
),
showlegend=True,
)
data= [trace1, trace2,trace3]
fig = dict(data=data, layout=layout)
iplot(fig, filename='line-mode')
Add this to your layout definition.
showlegend = True,
hovermode = 'x'
Add this to your xaxis definition.
showspikes = True,
spikemode = 'across',
spikesnap = 'cursor',
showline=True,
showgrid=True,
...
And add this to your layout definition:
spikedistance = -1,
xaxis=dict(...
Please refer to this post and the documentation by plotly. :)
EDIT
You ask for the x-axis lable. Please use
spikemode = 'across+toaxis'
Additionally I would suggest to use
spikedash = 'solid'
because it is better fitting your example.

How can I use the plotly dropdown menu feature to update the z value in my choropleth map?

I just want to create a menu on the plot where I'm able to change the z-value in data only. I tried looking at other examples on here: https://plot.ly/python/dropdowns/#restyle-dropdown but it was hard since the examples were not exactly similar to my plot.
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
data = [go.Choropleth(
locations = df['CODE'],
z = df['GDP (BILLIONS)'],
text = df['COUNTRY'],
colorscale = [
[0, "rgb(5, 10, 172)"],
[0.35, "rgb(40, 60, 190)"],
[0.5, "rgb(70, 100, 245)"],
[0.6, "rgb(90, 120, 245)"],
[0.7, "rgb(106, 137, 247)"],
[1, "rgb(220, 220, 220)"]
],
autocolorscale = False,
reversescale = True,
marker = go.choropleth.Marker(
line = go.choropleth.marker.Line(
color = 'rgb(180,180,180)',
width = 0.5
)),
colorbar = go.choropleth.ColorBar(
tickprefix = '$',
title = 'GDP<br>Billions US$'),
)]
layout = go.Layout(
title = go.layout.Title(
text = '2014 Global GDP'
),
geo = go.layout.Geo(
showframe = False,
showcoastlines = False,
projection = go.layout.geo.Projection(
type = 'equirectangular'
)
),
annotations = [go.layout.Annotation(
x = 0.55,
y = 0.1,
xref = 'paper',
yref = 'paper',
text = 'Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
CIA World Factbook</a>',
showarrow = False
)]
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'd3-world-map')
It's been a while since this was asked, but I figured it was still worth answering. I can't speak to how this might have changed since it was asked in 2019, but this works today.
First, I'll provide the code I used to create the new z values and the dropdown menu, then I'll provide all of the code I used to create these graphs in one chunk (easier to cut and paste...and all that).
This is the data I used for the alternate data in the z field.
import plotly.graph_objects as go
import pandas as pd
import random
z2 = df['GDP (BILLIONS)'] * .667 + 12
random.seed(21)
random.shuffle(z2)
df['z2'] = z2 # example as another column in df
print(df.head()) # validate as expected
z3 = df['GDP (BILLIONS)'] * .2 + 1000
random.seed(231)
random.shuffle(z3) # example as a series outside of df
z4 = df['GDP (BILLIONS)']**(1/3) * df['GDP (BILLIONS)']**(1/2)
random.seed(23)
random.shuffle(z4)
z4 = z4.tolist() # example as a basic Python list
To add buttons to change z, you'll add updatemenus to your layout. Each dict() is a separate dropdown option. At a minimum, each button requires a method, a label, and args. These represent what is changing (method for data, layout, or both), what it's called in the dropdown (label), and the new information (the new z in this example).
args for changes to data (where the method is either restyle or update) can also include the trace the change applies to. So if you had a bar chart and a line graph together, you may have a button that only changes the bar graph.
Using the same structure you have:
updatemenus = [go.layout.Updatemenu(
x = 1, xanchor = 'right', y = 1.15, type = "dropdown",
pad = {'t': 5, 'r': 20, 'b': 5, 'l': 30}, # around all buttons (not indiv buttons)
buttons = list([
dict(
args = [{'z': [df['GDP (BILLIONS)']]}], # original data; nest data in []
label = 'Return to the Original z',
method = 'restyle' # restyle is for trace updates
),
dict(
args = [{'z': [df['z2']]}], # nest data in []
label = 'A different z',
method = 'restyle'
),
dict(
args = [{'z': [z3]}], # nest data in []
label = 'How about this z?',
method = 'restyle'
),
dict(
args = [{'z': [z4]}], # nest data in []
label = 'Last option for z',
method = 'restyle'
)])
)]
All code used to create this graph in one chunk (includes code shown above).
import plotly.graph_objs as go
import pandas as pd
import ssl
import random
# to collect data without an error
ssl._create_default_https_context = ssl._create_unverified_context
# data used in plot
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
# z values used in buttons
z2 = df['GDP (BILLIONS)'] * .667 + 12
random.seed(21)
random.shuffle(z2)
df['z2'] = z2 # example as another column in the data frame
print(df.head()) # validate as expected
z3 = df['GDP (BILLIONS)'] * .2 + 1000
random.seed(231)
random.shuffle(z3) # example as a series outside of the data frame
z4 = df['GDP (BILLIONS)']**(1/3) * df['GDP (BILLIONS)']**(1/2)
random.seed(23)
random.shuffle(z4)
z4 = z4.tolist() # example as a basic Python list
data = [go.Choropleth(
locations = df['CODE'], z = df['GDP (BILLIONS)'], text = df['COUNTRY'],
colorscale = [
[0, "rgb(5, 10, 172)"],
[0.35, "rgb(40, 60, 190)"],
[0.5, "rgb(70, 100, 245)"],
[0.6, "rgb(90, 120, 245)"],
[0.7, "rgb(106, 137, 247)"],
[1, "rgb(220, 220, 220)"]],
reversescale = True,
marker = go.choropleth.Marker(
line = go.choropleth.marker.Line(
color = 'rgb(180,180,180)', width = 0.5)),
colorbar = go.choropleth.ColorBar(
tickprefix = '$',
title = 'GDP<br>Billions US$',
len = .6) # I added this for aesthetics
)]
layout = go.Layout(
title = go.layout.Title(text = '2014 Global GDP'),
geo = go.layout.Geo(
showframe = False, showcoastlines = False,
projection = go.layout.geo.Projection(
type = 'equirectangular')
),
annotations = [go.layout.Annotation(
x = 0.55, y = 0.1, xref = 'paper', yref = 'paper',
text = 'Source: <a href="https://www.cia.gov/library/publications/the-world-factbook/fields/2195.html">\
CIA World Factbook</a>',
showarrow = False
)],
updatemenus = [go.layout.Updatemenu(
x = 1, xanchor = 'right', y = 1.15, type = "dropdown",
pad = {'t': 5, 'r': 20, 'b': 5, 'l': 30},
buttons = list([
dict(
args = [{'z': [df['GDP (BILLIONS)']]}], # original data; nest data in []
label = 'Return to the Original z',
method = 'restyle' # restyle is for trace updates only
),
dict(
args = [{'z': [df['z2']]}], # nest data in []
label = 'A different z',
method = 'restyle'
),
dict(
args = [{'z': [z3]}], # nest data in []
label = 'How about this z?',
method = 'restyle'
),
dict(
args = [{'z': [z4]}], # nest data in []
label = 'Last option for z',
method = 'restyle'
)])
)]
)
fig = go.Figure(data = data, layout = layout)
fig.show()

How to do a threshold line on a bar chart using plotly

Currently written this code that produces a bar chart but would like to add a threshold line. Could anyone help me please?
def make_bar_chart(data):
"""Takes a list of dicts with a time and price"""
# Times
chart_x = []
# Prices
chart_y = []
# Create the relevant arrays
for item in data:
chart_x.append(item["time"])
chart_y.append(item["price"])
# Make the chart
the_graph = Bar(x = chart_x, y = chart_y , name = "Stocks")
graph_data = Data([the_graph])
the_layout = Layout(title = "Stocks", xaxis = dict(title = "Time"), yaxis = dict(title = "Price"))
the_figure = Figure(data = graph_data, layout = the_layout)
plotly.offline.plot(the_figure, filename = "stocks.html")
Try something like this. In plotly it seems that lines are provided via shapes.
the_layout = Layout(title = "Stocks",
xaxis = dict(title = "Time"),
yaxis = dict(title = "Price"),
shapes=[
{
'type': 'line',
'xref': 'paper',
'x0': 0,
'y0': 100, # use absolute value or variable here
'x1': 1,
'y1': 100, # ditto
'line': {
'color': 'rgb(50, 171, 96)',
'width': 1,
'dash': 'dash',
},
},
],
)
I haven't tested this as you haven't provided sample data. Well done for supplying code on your first question, but on Stack Overflow it's best to provide a completely self-contained example that people can copy and run 'as is.'

Resources