Create new "timezone" column from TimezoneFinder() with longitude and latitude columns as input in PySpark - apache-spark

I want to create a new column that contains the timezone corresponding to each row's longitude and latitude. The longitude and latitude from already existing columns are the inputs to the timezonefinder wrapper function, i.e. get_timezone(). I keep getting TypeError: an integer is required (got type Column).
Thanks.
from timezonefinder import TimezoneFinder
def get_timezone(longitude, latitude):
    """Return the timezone name found by TimezoneFinder for one coordinate pair.

    ``longitude`` and ``latitude`` are forwarded to
    ``TimezoneFinder.timezone_at`` as ``lng`` and ``lat``. This is a plain
    Python function: it works on individual numeric values, not on Spark
    ``Column`` objects.
    """
    tzf = TimezoneFinder()
    return tzf.timezone_at(lng=longitude, lat=latitude)
location_table = location_table.withColumn("timezone", get_timezone(location_table["location_longitude"], location_table["location_latitude"]))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<command-253463262459944> in <module>
8
9 # df = sqlContext.read.parquet(INPUT)
---> 10 location_table.withColumn("timezone", get_timezone(location_table["location_longitude"].cast(IntegerType()), location_table["location_latitude"].cast(IntegerType())))
11 # .write.parquet(OUTPUT)
<command-253463262459944> in get_timezone(longitude, latitude)
3 def get_timezone(longitude, latitude):
4 tzf = TimezoneFinder()
----> 5 return tzf.timezone_at(lng=longitude, lat=latitude)
6
7 # udf_timezone = F.udf(get_timezone, StringType())
/databricks/python/lib/python3.7/site-packages/timezonefinder/timezonefinder.py in timezone_at(self, lng, lat)
657 :return: the timezone name of the matched timezone polygon. possibly "Etc/GMT+-XX" in case of an ocean timezone.
658 """
--> 659 lng, lat = rectify_coordinates(lng, lat)
660
661 shortcut_id_x, shortcut_id_y = coord2shortcut(lng, lat)
TypeError: an integer is required (got type Column)

You need to convert the function to a UDF first:
import pyspark.sql.functions as F
from timezonefinder import TimezoneFinder
# Registering the function as a UDF (string return type) is the actual fix:
# Spark then calls it once per row with plain Python values instead of the
# function being called once, eagerly, with Column objects.
@F.udf('string')
def get_timezone(longitude, latitude):
    """Return the timezone name for one row's coordinates, or None.

    ``longitude`` and ``latitude`` are the per-row values of the columns
    passed to the UDF. If either is missing the lookup is skipped and
    ``None`` is returned, which leaves a null in the resulting column.
    """
    if longitude is None or latitude is None:
        return None
    # NOTE(review): a new TimezoneFinder is constructed on every call;
    # presumably this is costly on large tables -- consider reusing one.
    tzf = TimezoneFinder()
    return tzf.timezone_at(lng=longitude, lat=latitude)
location_table = location_table.withColumn("timezone", get_timezone(location_table["location_longitude"], location_table["location_latitude"]))

Related

How to subset a xarray.Dataset according to lat/lon values taken from a SRTM DEM extents

I have a year wise (1980-2020) precipitation data set in netCDF format. I am importing them in xarray to have 40 years of merged precipitation values:
import netCDF4
import numpy
import xarray as xr
import pandas as pd
prcp=xr.open_mfdataset('/home/hrsa/Sayantan/HAR_V2/prcp/HARv2_d10km_d_2d_prcp_*.nc',combine = 'nested', concat_dim="time")
prcp
which renders:
xarray.Dataset
Dimensions:
time: 14976, west_east: 381, south_north: 252
Coordinates:
time
(time)
datetime64[ns]
1980-01-01 ... 2020-12-31
west_east
(west_east)
float32
-1.675e+06 -1.665e+06 ... 2.125e+06
south_north
(south_north)
float32
-7.45e+05 -7.35e+05 ... 1.765e+06
lon
(south_north, west_east)
float32
dask.array<chunksize=(252, 381), meta=np.ndarray>
lat
(south_north, west_east)
float32
dask.array<chunksize=(252, 381), meta=np.ndarray>
Data variables:
prcp
(time, south_north, west_east)
float32
dask.array<chunksize=(366, 252, 381), meta=np.ndarray>
Attributes: (33)
This is a large dataset, hence I need to subset it according to an SRTM image whose extent (in EPSG:4326) is defined as
# Extents of the SRTM DEM covering Panchi_B and the SASE AWS/Base Camp
min_lon = 77.0
min_lat = 32.0
max_lon = 78.0
max_lat = 33.0
In order to subset according to above coordinates I have tried the following:
prcp = prcp.sel(lat = slice(min_lat,max_lat), lon = slice(min_lon,max_lon))
the Error output:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/indexing.py:73, in group_indexers_by_index(data_obj, indexers, method, tolerance)
72 try:
---> 73 index = xindexes[key]
74 coord = data_obj.coords[key]
KeyError: 'lat'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
Input In [25], in <cell line: 1>()
----> 1 prcp = prcp.sel(lat = slice(min_lat,max_lat), lon = slice(min_lon,max_lon))
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/dataset.py:2501, in Dataset.sel(self, indexers, method, tolerance, drop, **indexers_kwargs)
2440 """Returns a new dataset with each array indexed by tick labels
2441 along the specified dimension(s).
2442
(...)
2498 DataArray.sel
2499 """
2500 indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel")
-> 2501 pos_indexers, new_indexes = remap_label_indexers(
2502 self, indexers=indexers, method=method, tolerance=tolerance
2503 )
2504 # TODO: benbovy - flexible indexes: also use variables returned by Index.query
2505 # (temporary dirty fix).
2506 new_indexes = {k: v[0] for k, v in new_indexes.items()}
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/coordinates.py:421, in remap_label_indexers(obj, indexers, method, tolerance, **indexers_kwargs)
414 indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "remap_label_indexers")
416 v_indexers = {
417 k: v.variable.data if isinstance(v, DataArray) else v
418 for k, v in indexers.items()
419 }
--> 421 pos_indexers, new_indexes = indexing.remap_label_indexers(
422 obj, v_indexers, method=method, tolerance=tolerance
423 )
424 # attach indexer's coordinate to pos_indexers
425 for k, v in indexers.items():
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/indexing.py:110, in remap_label_indexers(data_obj, indexers, method, tolerance)
107 pos_indexers = {}
108 new_indexes = {}
--> 110 indexes, grouped_indexers = group_indexers_by_index(
111 data_obj, indexers, method, tolerance
112 )
114 forward_pos_indexers = grouped_indexers.pop(None, None)
115 if forward_pos_indexers is not None:
File ~/.pyenv/versions/3.9.7/envs/v3.9.7/lib/python3.9/site-packages/xarray/core/indexing.py:84, in group_indexers_by_index(data_obj, indexers, method, tolerance)
82 except KeyError:
83 if key in data_obj.coords:
---> 84 raise KeyError(f"no index found for coordinate {key}")
85 elif key not in data_obj.dims:
86 raise KeyError(f"{key} is not a valid dimension or coordinate")
KeyError: 'no index found for coordinate lat'
How can I resolve this issue? Any help will be appreciated, Thank you.
############# Edit (for @Robert Wilson) ##################
In order to find out the ranges, I did the following:
lon = prcp.lon.to_dataframe()
lon
lat = prcp.lat.to_dataframe()
lat

Performing a Principal Component Analysis to reconstruct time series creates more values than expected

I want to do a Principal Component Analysis following this notebook to reconstruct the DJIA (I'm using alpha_vantage) from its components (found with Quandl). Yet, it seems that I create more values than expected (more than the original dataframe contains) when reconstructing the values by multiplying the principal components by their weights:
kernel_pca = KernelPCA(n_components=5).fit(df_z_components)
pca_5 = kernel_pca.transform(-daily_df_components)
weights = fn_weighted_average(kernel_pca.lambdas_)
reconstructed_values = np.dot(pca_5, weights)
Indeed, daily_df_components is created from the components of the DJIA by the quandl API, which seems to have more data than the library I use to get the DJIA Index, alpha_vantage.
Here is the full code
"""
Obtaining the components data from quandl
"""
import quandl
QUANDL_API_KEY = 'MYKEY'
quandl.ApiConfig.api_key = QUANDL_API_KEY
SYMBOLS = [
'AAPL', 'MMM', 'BA', 'AXP', 'CAT',
'CVX', 'CSCO', 'KO', 'DD', 'XOM',
'GS', 'HD', 'IBM', 'INTC', 'JNJ',
'JPM', 'MCD', 'MRK', 'MSFT', 'NKE',
'PFE', 'PG', 'UNH', 'UTX', 'TRV',
'VZ', 'V', 'WMT', 'WBA', 'DIS'
]
wiki_symbols = ['WIKI/%s'%symbol for symbol in SYMBOLS]
df_components = quandl.get(
wiki_symbols,
start_date='2017-01-01',
end_date='2017-12-31',
column_index=11)
df_components.columns = SYMBOLS
filled_df_components = df_components.fillna(method='ffill')
daily_df_components = filled_df_components.resample('24h').ffill()
daily_df_components = daily_df_components.fillna(method='bfill')
"""
Download the all-time DJIA dataset
"""
from alpha_vantage.timeseries import TimeSeries
# Update your Alpha Vantage API key here...
ALPHA_VANTAGE_API_KEY = 'MYKEY'
ts = TimeSeries(key=ALPHA_VANTAGE_API_KEY, output_format='pandas')
df, meta_data = ts.get_intraday(symbol='DIA',interval='1min', outputsize='full')
# Finding eigenvectors and eigen values
fn_weighted_average = lambda x: x/x.sum()
weighted_values = fn_weighted_average(fitted_pca.lambdas_)[:5]
from sklearn.decomposition import KernelPCA
fn_z_score = lambda x: (x - x.mean())/x.std()
df_z_components = daily_df_components.apply(fn_z_score)
fitted_pca = KernelPCA().fit(df_z_components)
# Reconstructing the Dow Average with PCA
import numpy as np
kernel_pca = KernelPCA(n_components=5).fit(df_z_components)
pca_5 = kernel_pca.transform(-daily_df_components)
weights = fn_weighted_average(kernel_pca.lambdas_)
reconstructed_values = np.dot(pca_5, weights)
# Combine PCA and Index to compare
df_combined = djia_2020_weird.copy()
df_combined['pca_5'] = reconstructed_values
But it returns:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-100-2808dc14f789> in <module>()
9 # Combine PCA and Index to compare
10 df_combined = djia_2020_weird.copy()
---> 11 df_combined['pca_5'] = reconstructed_values
12 df_combined = df_combined.apply(fn_z_score)
13 df_combined.plot(figsize=(12,8));
3 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/construction.py in sanitize_index(data, index)
746 if len(data) != len(index):
747 raise ValueError(
--> 748 "Length of values "
749 f"({len(data)}) "
750 "does not match length of index "
ValueError: Length of values (361) does not match length of index (14)
Indeed, reconstructed_values is 361 long and df_combined is 14 values long...
Here is this last dataframe:
DJI
date
2021-01-21 NaN
2021-01-22 311.37
2021-01-23 310.03
2021-01-24 310.03
2021-01-25 310.03
2021-01-26 309.01
2021-01-27 309.49
2021-01-28 302.17
2021-01-29 305.25
2021-01-30 299.20
2021-01-31 299.20
2021-02-01 299.20
2021-02-02 302.13
2021-02-03 307.86
Maybe the reason is that the notebook author was able to get the data for the whole year he was interested in, whereas when I run the code it seems that I only have two months?
Ahoy there, I'm the author of the notebook. It seems Quandl no longer provides historical prices of DJIA after the time of writing, and copyright wasn't granted to redistribute the data. For research, you may consider other free stock tickers to proxy DJIA.
The example usages have been updated in the repo to demonstrate KernelPCA, as explained here.

KeyError: "None of [Index(['23/01/2020' ......,\n dtype='object', length=9050)] are in the [columns]"

I am learning pandas and matplotlib on my own by using some public dataset via
this api link
I'm using colab and below are my codes:
import datetime
import io
import json
import pandas as pd
import requests
import matplotlib.pyplot as plt
confirm_resp = requests.get('https://api.data.gov.hk/v2/filterq=%7B%22resource%22%3A%22http%3A%2F%2Fwww.chp.gov.hk%2Ffiles%2Fmisc%2Fenhanced_sur_covid_19_eng.csv%22%2 C%22section%22%3A1%2C%22format%22%3A%22json%22%7D').content
confirm_df = pd.read_json(io.StringIO(confirm_resp.decode('utf-8')))
confirm_df.columns = confirm_df.columns.str.replace(" ", "_")
pd.to_datetime(confirm_df['Report_date'])
confirm_df.columns = ['Case_no', 'Report_date', 'Onset_date', 'Gender', 'Age',
'Name_of_hospital_admitted', 'Status', 'Resident', 'Case_classification', 'Confirmed_probable']
confirm_df = confirm_df.drop('Name_of_hospital_admitted', axis = 1)
confirm_df.head()
and this is what the dataframe looks like:
Case_no
Report_date
Onset_date
Gender
Age
Status
Resident
Case_classification
Confirmed_probable
1
23/01/2020
21/01/2020
M
39
Discharged
Non-HK resident
Imported case
Confirmed
2
23/01/2020
18/01/2020
M
56
Discharged
HK resident
Imported case
Confirmed
3
24/01/2020
20/01/2020
F
62
Discharged
Non-HK resident
Imported case
Confirmed
4
24/01/2020
23/01/2020
F
62
Discharged
Non-HK resident
Imported case
Confirmed
5
24/01/2020
23/01/2020
M
63
Discharged
Non-HK resident
Imported case
Confirmed
When I try to make a simple plot with the below code:
x = confirm_df['Report_date']
y = confirm_df['Case_classification']
confirm_df.plot(x, y)
It gives me the below error:
KeyError Traceback (most recent call last)
<ipython-input-17-e4139a9b5ef1> in <module>()
4 y = confirm_df['Case_classification']
5
----> 6 confirm_df.plot(x, y)
3 frames
/usr/local/lib/python3.6/dist-packages/pandas/plotting/_core.py in __call__(self, *args, **kwargs)
912 if is_integer(x) and not data.columns.holds_integer():
913 x = data_cols[x]
--> 914 elif not isinstance(data[x], ABCSeries):
915 raise ValueError("x must be a label or position")
916 data = data.set_index(x)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2910 if is_iterator(key):
2911 key = list(key)
-> 2912 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2913
2914 # take() does not accept boolean indexers
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1252 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1253
-> 1254 self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
1255 return keyarr, indexer
1256
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1296 if missing == len(indexer):
1297 axis_name = self.obj._get_axis_name(axis)
-> 1298 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1299
1300 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index(['23/01/2020', '23/01/2020', '24/01/2020', '24/01/2020', '24/01/2020',\n '26/01/2020', '26/01/2020', '26/01/2020', '29/01/2020', '29/01/2020',\n ...\n '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021',\n '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021', '05/01/2021'],\n dtype='object', length=9050)] are in the [columns]"
I have tried to make the plot with and without converting Report date to a datetime object, and I tried substituting the x value with each of the columns in the data frame, but all attempts give me the same error.
Appreciate if anyone can help me to understand how to handle these issues here and going forward. I've spent hours to resolve it but cannot find the answers.
I did not encounter this issue before when I downloaded some notebooks and datasets from Kaggle to follow along.
Thank you and happy new year.
First, you need to assign the converted date back to the column:
confirm_df['Report_date'] = pd.to_datetime(confirm_df['Report_date'])
Second, when the plot method is called on a DataFrame object, you need to provide only the column names as arguments (1).
confirm_df.plot(x='Report_date', y='Case_classification')
But the above code still throws an error because 'Case_classification' is not numeric data.
You are trying to plot datetime vs. categorical data, so a normal plot won't work, but something like this could work (2):
# I used only first 15 examples here, full dataset is kinda messy
confirm_df.iloc[:15, :].groupby(['Report_date', 'Case_classification']).size().unstack().plot.bar()
(1)pandas.DataFrame.plot
(2)How to plot categorical variable against a date column in Python
Several problems. First, the links were incorrect, I have edited them (probably just a copy/paste error). Second, you have to assign the converted datetime series back to the dataframe. Use print(confirm_df.dtypes) to see the difference. Then, the dataset is not ordered by date, but matplotlib expects an ordered x-axis. Well, actually, the problem was that the parser misinterpreted the datetime objects. I have added dayfirst=True to ensure that the dates are read correctly. Finally, what do you want to plot here? Just the cases by date? The number of cases per group by date? Your original code implies just the former but this is not really informative, is it?
import io
import pandas as pd
import requests
import matplotlib.pyplot as plt
# Download, clean and plot the case data. The steps below are
# order-dependent: the date column is converted before the columns are
# renamed positionally.
print("starting download")
# The query string asks the data.gov.hk filter API for section 1 of the CHP
# enhanced_sur_covid_19_eng.csv resource, returned as JSON.
confirm_resp = requests.get('https://api.data.gov.hk/v2/filter?q=%7B%22resource%22%3A%22http%3A%2F%2Fwww.chp.gov.hk%2Ffiles%2Fmisc%2Fenhanced_sur_covid_19_eng.csv%22%2C%22section%22%3A1%2C%22format%22%3A%22json%22%7D').content
print("finished download")
# Decode the raw bytes and let pandas parse the JSON payload into a DataFrame.
confirm_df = pd.read_json(io.StringIO(confirm_resp.decode('utf-8')))
# Replace spaces in the column names with underscores so the date column can
# be addressed as 'Report_date' (presumably 'Report date' in the raw data).
confirm_df.columns = confirm_df.columns.str.replace(" ", "_")
# Assign the converted series back to the DataFrame; dayfirst=True because
# the dates are written day-first (e.g. 23/01/2020).
confirm_df['Report_date'] = pd.to_datetime(confirm_df['Report_date'], dayfirst=True)
# Overwrite all ten column names positionally with consistent identifiers.
confirm_df.columns = ['Case_no', 'Report_date', 'Onset_date', 'Gender', 'Age',
'Name_of_hospital_admitted', 'Status', 'Resident', 'Case_classification', 'Confirmed_probable']
# The hospital column is not used below, so drop it.
confirm_df = confirm_df.drop('Name_of_hospital_admitted', axis = 1)
# Print the dtypes to confirm that Report_date was converted to a datetime dtype.
print(confirm_df.dtypes)
# Plot the case classification against the report date on a wide figure.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(confirm_df['Report_date'], confirm_df['Case_classification'])
plt.tight_layout()
plt.show()
Sample output:
Some grouping and data aggregation might be more informative, but you have to decide what you want to display first before writing the code.

How to fix ValueError when using the reshape function in Python

I am trying to extract chlorophyll from a monthly averaged chl .nc satellite file (processed L2 to L3 product) which has geographic boundary coordinates latitude -25 S to -12 N and longitude -179 W to -169 E, and there are 1443*1110 floating point array elements. I have to write Python code that will let me input my latitude and longitude and will print out the corresponding chl value found at that location.
Here are starting lines of my code and the error is in reshape lines.
#!/usr/bin/env python
import numpy as np
import pylab
#input user's selection of latitudes and longitudes points
lat = float(input('enter your latitude here: '))
print 'lat: ',lat
lon = float(input('enter your longitude here: '))
print 'lon: ',lon
#Call path directory, call file name, call variable 'fromfile'
path = '/home/rst/data/final_data/2002_chl_sst/Jul/Output-L3/monthly /1000m-chlor_a/mean/A20021822002212.MON.chlor_a.map.nc'
fname = open(path)
chl = np.fromfile(fname, dtype=np.float64)
fname.close()
#reshape the data into 2D matrix array
nrow = 1443.0
ncol = 1110.0
chl = chl.reshape([nrow,ncol]) # this is line 21
print chl.shape
It's the first error in my code, the rest of my code is not shown here. What is the total of this new array ? Appreciate any help, thanks
Traceback (most recent call last):
File "prob1_extract_chl_2002JUL.py", line 21, in <module>
chl = chl.reshape([nrow,ncol])
ValueError: total size of new array must be unchanged

Applying function to pandas dataframe

I have a pandas dataframe called 'tourdata' consisting of 676k rows of data. Two of the columns are latitude and longitude.
Using the reverse_geocode package I want to convert these coordinates to a country data.
When I call :
import reverse_geocode as rg
tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
I get the error :
ValueError Traceback (most recent call last)
in ()
1 coordinates = (tourdata['latitude'],tourdata['longitude']),
----> 2 tourdata['Country'] = rg.search((row[tourdata['latitude']],row[tourdata['longitude']]))
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in search(coordinates)
114 """
115 gd = GeocodeData()
--> 116 return gd.query(coordinates)
117
118
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in query(self, coordinates)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
---> 48 raise e
49 else:
50 results = [self.locations[index] for index in indices]
~/anaconda/envs/py3/lib/python3.6/site-packages/reverse_geocode/__init__.py
in query(self, coordinates)
43 """
44 try:
---> 45 distances, indices = self.tree.query(coordinates, k=1)
46 except ValueError as e:
47 logging.info('Unable to parse coordinates: {}'.format(coordinates))
ckdtree.pyx in scipy.spatial.ckdtree.cKDTree.query()
ValueError: x must consist of vectors of length 2 but has shape (2,
676701)
To test that the package is working :
coordinates = (tourdata['latitude'][0],tourdata['longitude'][0]),
results = (rg.search(coordinates))
print(results)
Outputs :
[{'country_code': 'AT', 'city': 'Wartmannstetten', 'country': 'Austria'}]
Any help with this appreciated. Ideally I'd like to access the resulting dictionary and apply only the country code to the Country column.
The search method expects a list of coordinates. To look up a single data point you can use the "get" method instead.
Try :
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
It works fine for me :
import pandas as pd
tourdata = pd.DataFrame({'latitude':[0.3, 2, 0.6], 'longitude':[12, 5, 0.8]})
tourdata['country'] = tourdata.apply(lambda x: rg.get((x['latitude'], x['longitude'])), axis=1)
tourdata['country']
Output :
0 {'country': 'Gabon', 'city': 'Booué', 'country...
1 {'country': 'Sao Tome and Principe', 'city': '...
2 {'country': 'Ghana', 'city': 'Mumford', 'count...
Name: country, dtype: object

Resources