Extract an area from a high resolution netCDF file in Python

I am trying to extract an area from a netcdf file by longitude and latitude.
However, the resolution is much finer than 1x1 degree.
How would you extract an area in that case, e.g. lon: 30-80 and lat: 30-40?
The file can be found here: https://drive.google.com/open?id=1zX-qYBdXT_GuktC81NoQz9xSxSzM-CTJ
Keys and shapes are as follows:
odict_keys(['crs', 'lat', 'lon', 'Band1'])
crs ()
lat (25827,)
lon (35178,)
Band1 (25827, 35178)
I have tried this, but with the high resolution the indices don't correspond to the actual longitude/latitude values.
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
file = path + '20180801-ESACCI-L3S_FIRE-BA-MODIS-AREA_3-fv5.1-JD.nc'
fh = Dataset(file)
longitude = fh.variables['lon'][:]
latitude = fh.variables['lat'][:]
band1 = fh.variables['Band1'][:30:80,30:40]

Since you have variables(dimensions): ..., int16 Band1(lat,lon), you can apply np.where to the lat and lon variables to find the appropriate indices and then select the corresponding Band1 data as sel_band1:
import numpy as np
from netCDF4 import Dataset
file = '20180801-ESACCI-L3S_FIRE-BA-MODIS-AREA_3-fv5.1-JD.nc'
with Dataset(file) as nc_obj:
    lat = nc_obj.variables['lat'][:]
    lon = nc_obj.variables['lon'][:]
    sel_lat, sel_lon = [30, 40], [30, 80]
    sel_lat_idx = np.where((lat >= sel_lat[0]) & (lat <= sel_lat[1]))
    sel_lon_idx = np.where((lon >= sel_lon[0]) & (lon <= sel_lon[1]))
    sel_band1 = nc_obj.variables['Band1'][:][np.ix_(sel_lat_idx[0], sel_lon_idx[0])]
Note that np.where applied to lat and lon returns 1D index arrays. Use np.ix_ to apply them to the 2D data in Band1.
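If loading the full Band1 array into memory is a concern (it is roughly 25827 x 35178 int16 values), a sketch like the following reads only the requested hyperslab from disk. It assumes lat and lon are stored in ascending order; if lat is stored descending, the slice bounds would need to be reversed:
import numpy as np
from netCDF4 import Dataset

file = '20180801-ESACCI-L3S_FIRE-BA-MODIS-AREA_3-fv5.1-JD.nc'
with Dataset(file) as nc_obj:
    lat = nc_obj.variables['lat'][:]
    lon = nc_obj.variables['lon'][:]
    # searchsorted assumes monotonically increasing coordinate arrays
    lat_sl = slice(np.searchsorted(lat, 30), np.searchsorted(lat, 40, side='right'))
    lon_sl = slice(np.searchsorted(lon, 30), np.searchsorted(lon, 80, side='right'))
    # slicing the Variable directly (instead of [:] first) reads only this subset from disk
    sel_band1 = nc_obj.variables['Band1'][lat_sl, lon_sl]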

Related

Creating a rainfall colormap for points inside a watershed polygon

I really appreciate your help in developing my code, since I am not an expert in Python. I am attempting to write code that can:
Read all the points (longitude, latitude, cumulative forecasted rainfall for 24, 48, and 72 hours) from a csv file (Mean_PCP_REPS_12_20220809_Gridded.csv).
Read the polygon representing the watershed boundary (NelsonRiverBasin.shp).
Mask/remove the points outside of the watershed polygon.
Create a rainfall colormap image or raster for the points inside the watershed polygon.
Color boundaries should be based on rainfall value. I defined the rainfall range for each color in my code.
I have tried many ways but was not successful in creating an image or raster with the desired colormap. My Python code is as follows. It creates and saves "new_ras.tiff", but it cannot remap the colors of this image based on the rainfall ranges after its creation.
from __future__ import division
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon, MultiPolygon
import operator
#extending the code
import os
from matplotlib.patches import Patch
from matplotlib.colors import ListedColormap
import matplotlib.colors as colors
import seaborn as sns
import numpy as np
import rioxarray as rxr
import earthpy as et
import earthpy.plot as ep
from scipy.interpolate import griddata #added code up to here
import rasterio
# load the data that should be cropped by the polygon
# this assumes that the csv file already includes
# a geometry column with point data as performed below
dat_gpd = pd.read_csv(r'Mean_PCP_REPS_12_20220809_Gridded.csv')
# make shapely points out of the X and Y coordinates
point_data = [Point(xy) for xy in zip(dat_gpd.iloc[:,0], dat_gpd.iloc[:,1])]
all_pts = list(zip(dat_gpd.iloc[:,0], dat_gpd.iloc[:,1]))
# assign shapely points as geometry to a geodataframe
# Like this you can also inspect the individual points if needed
arr_gpd = gpd.GeoDataFrame(dat_gpd, crs=4269, geometry=point_data)
# assign defined polygon to a new dataframe
nlpoly = gpd.read_file('NelsonRiverBasin.shp')
nlpoly = nlpoly.to_crs('epsg:4269')
mask = [nlpoly.contains(Point(p)).any() for p in all_pts]
# define a new dataframe from the spatial join of the dataframe with the data to be cropped
# and the dataframe with the polygon data, using the within function.
#dat_fin = gpd.sjoin(arr_gpd, nlpoly[['OCEAN_EN', 'COUNT', 'geometry']], predicate = 'within')
#dat_fin = dat_fin.to_crs('epsg:4326')
#dat_fin.plot(column= 'Hr72')
#plt.savefig('Raster2.tiff')
data = dat_gpd[['Long', 'Lat', 'Hr72']]
pts = list(zip(data.Long, data.Lat))
print (pts)
print(type(pts))
pts2 = [pts[i] for i in range(len(pts)) if mask[i]]
print(pts2)
print(type(pts2))
pts_val = data.Hr72.values
pts_val2 = [pts_val[i] for i in range(len(pts_val)) if mask[i]]
new_pts = [Point(xy) for xy in pts2]
print(type(pts_val2[1]))
pts3=[]
for tup, j in zip(pts2, range(len(pts_val2))):
    pts3.append(list(tup) + [pts_val2[j]])
print(type(pts3))
masked_pts = pd.DataFrame(pts3)
print(masked_pts)
masked_pts.columns = pd.Series(['Long', 'Lat', 'Hr72'])
new_arr_gpd = gpd.GeoDataFrame(masked_pts, crs = 4269, geometry = new_pts)
new_arr_gpd.plot(column = 'Hr72')
plt.savefig('new_ras.tiff')
rRes = 0.01
#xRange = np.arange(data.Long.min(), data.Long.max(), rRes)
#yRange = np.arange(data.Lat.min(), data.Lat.max(), rRes)
#print(xRange[:5],yRange[:5])
#gridX, gridY = np.meshgrid(xRange, yRange)
#grid_pcp = griddata(pts2, pts_val2, (gridX, gridY), method = 'linear')
#Extending the code
sns.set(font_scale = 1, style = "white")
lidar_chm = rxr.open_rasterio(r'new_ras.tiff', masked=True).squeeze()
# Define the colors you want
cmap = ListedColormap(["white", "lightskyblue","dodgerblue","mediumblue","lawngreen","limegreen", "forestgreen","darkgreen", "yellow", "orange","darkorange", "chocolate", "red", "maroon", "indianred","lightpink", "pink", "lightgray", "whitesmoke" ])
# Define a normalization from values -> colors
norm = colors.BoundaryNorm([0, 1, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 150, 200, 250], 19)
fig, ax = plt.subplots(figsize=(9, 5))
chm_plot = ax.imshow(lidar_chm, cmap=cmap, norm=norm)  # plot the raster opened above, not the filename string
#print(chm_plot)
map_title = input ("Enter a title for this map (for ex. 72-hr accumulated forecast map):")
ax.set_title("Hydrologic Forecast Centre (MTI)\n" + map_title)
# Add a legend for labels
legend_labels = {"white": "0-1", "lightskyblue": "1-5","dodgerblue": "5-10","mediumblue": "10-15","lawngreen": "15-20","limegreen": "20-25", "forestgreen": "25-30","darkgreen": "30-40", "yellow": "40-50", "orange": "50-60","darkorange": "60-70", "chocolate": "70-80", "red": "80-90", "maroon": "90-100","indianred": "100-110", "lightpink": "110-120", "pink": "120-150", "lightgray": "150-200", "whitesmoke": "200-250"}
patches = [Patch(color=color, label=label) for color, label in legend_labels.items()]
ax.legend(handles=patches,bbox_to_anchor=(1.2, 1),facecolor="white")
ax.set_axis_off()
plt.show()
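One possible way to get colors that follow the rainfall ranges is to interpolate the masked points onto a regular grid (as the commented-out griddata lines above already hint at) and plot the grid with the same cmap and norm. A minimal sketch, reusing pts2, pts_val2, masked_pts, cmap and norm from the code above:
# interpolate the masked rainfall points onto a regular lon/lat grid
rRes = 0.01
xRange = np.arange(masked_pts.Long.min(), masked_pts.Long.max(), rRes)
yRange = np.arange(masked_pts.Lat.min(), masked_pts.Lat.max(), rRes)
gridX, gridY = np.meshgrid(xRange, yRange)
grid_pcp = griddata(pts2, pts_val2, (gridX, gridY), method='linear')

# plot the gridded field so the colors are driven by the BoundaryNorm rainfall ranges
fig, ax = plt.subplots(figsize=(9, 5))
mesh = ax.pcolormesh(gridX, gridY, grid_pcp, cmap=cmap, norm=norm)
fig.colorbar(mesh, ax=ax)
plt.show()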

How can I achieve faster access to this netcdf?

I am selecting spatial and temporal subsets from this kind of NetCDF, opened with
ds = xr.open_mfdataset(file_list):
<xarray.Dataset>
Dimensions: (lat: 576, lon: 1152, time: 1464)
Coordinates:
* lon (lon) float32 0.0 0.3125 0.625 0.9375 ... 359.0625 359.375 359.6875
* lat (lat) float32 89.761 89.4514 89.1399 ... -89.1399 -89.4514 -89.761
* time (time) datetime64[ns] 1980-04-01T01:00:00 ... 1980-06-01
Data variables:
uasmean (lat, lon, time) float32 dask.array<shape=(576, 1152, 1464), chunksize=(576, 1152, 720)>
vasmean (lat, lon, time) float32 dask.array<shape=(576, 1152, 1464), chunksize=(576, 1152, 720)>
Attributes:
Creator: NCAR - CISL RDA (dattore)
history: Mon Aug 11 12:24:36 2014: ncatted -a history,global,d,, -O Wind...
I managed to get the correct subset in time and lon/lat using:
ds = ds.where((ds.time >= np.datetime64(date_ini)) & (ds.time <= np.datetime64(date_end)), drop=True)
ds = ds.where((ds.lon >= lonlat[0]) & (ds.lon <= lonlat[1]) & (ds.lat >= lonlat[2]) & (ds.lat <= lonlat[3]), drop=True)
Finally, to extract this information in my target format, I loop over time, convert each step to a dataframe, and export it to csv afterwards:
# for t in ds['time']:
t = ds['time'][0]
# Select time and convert to dataframe
df = ds.sel(time=t).to_dataframe()
My problem is that the conversion to a dataframe is slow. I know the original netCDF files are written to optimize the extraction of time series rather than the maps I am trying to extract, and that it is possible to reorder the coordinates and write new netCDF files to speed this up, but the dataset is too big... so that is not an option. Do you know any other way to speed up this extraction?
Thank you all in advance!
P.S.: I attached the complete script of this block of code that I am using to check the performance...
import os
import random
import shutil
from datetime import datetime, timedelta
from glob import glob
import pandas as pd
import xarray as xr
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import time
start_time = time.time()
files = glob('*.nc')
lonlat = [-5, 10, 50, 64]
date_ini = datetime(1980, 4, 28)
date_end = datetime(1980, 5, 3)
ds = xr.open_mfdataset(files)
print('[Processing 2D winds]')
# create date list to loop over folders
dates = pd.date_range(start=date_ini - timedelta(days=1), end=date_end + timedelta(days=1), freq='D')
# Create date list of files to open
file_list = []
for date in dates:
    file_list.append('Wind_CFS_Global_' + date.strftime('%Y.%m') + '.nc')
# Delete repeated elements
file_list = list(dict.fromkeys(file_list))
print(file_list)
# load data
ds = xr.open_mfdataset(file_list)
# Select temporal subset
ds = ds.get(['uasmean','vasmean'])
ds = ds.where((ds.time >= np.datetime64(date_ini)) & (ds.time <= np.datetime64(date_end)), drop=True)
# from 0º,360º to -180º,180º
ds['lon'] = (ds.lon + 180) % 360 - 180
ds = ds.sortby(['lon', 'lat'])  # pass both coordinates as one list
ds = ds.where((ds.lon >= lonlat[0]) & (ds.lon <= lonlat[1]) & (ds.lat >= lonlat[2]) & (ds.lat <= lonlat[3]), drop=True)
print(ds)
currents_list = []
# for t in ds['time']:
t = ds['time'][0]
# Select time and depth array
df = ds.sel(time=t).to_dataframe()
# reset index because longitude latitude are as multi-index and I want them as columns
df = df.reset_index()
# sort data-rows for TESEO: longitude, latitude (ascending)
df = df.sort_values(['lon', 'lat'])
# generate full file path
outfile = 'winds_' + df['time'][0].strftime('%Y%m%dT%H%M') + '.txt'
# export to ascii without separator, without header neither index column, replace nan by 0 and set 3 floating numbers
df.to_csv(path_or_buf=outfile,
sep=' ',
columns=['lon', 'lat', 'uasmean', 'vasmean'],
header=False,
index=False,
na_rep=0,
float_format='%.3f'
)
elapsed_time = (time.time() - start_time)
print('Elapsed time: {} sec.'.format(elapsed_time))
I found a big improvement in performance by doing this:
convert the whole xarray dataset to a dataframe once
loop over time directly in that dataframe
That makes a really big difference! Before, I was looping over time and converting each shorter dataset to a dataframe, which is really inefficient!
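A minimal sketch of that approach, reusing the subset ds and the output format from the script above:
# convert the whole (already subset) dataset to a dataframe once
df_all = ds.to_dataframe().reset_index()
# then loop over the time steps inside the dataframe
for t, df in df_all.groupby('time'):
    df = df.sort_values(['lon', 'lat'])
    outfile = 'winds_' + pd.Timestamp(t).strftime('%Y%m%dT%H%M') + '.txt'
    df.to_csv(path_or_buf=outfile, sep=' ',
              columns=['lon', 'lat', 'uasmean', 'vasmean'],
              header=False, index=False, na_rep=0, float_format='%.3f')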
Best regards!

Convert a numpy dataset to netCDF

I have a numpy array in python with size (16,250,186) representing time, latitude and longitude.
I want to convert it to a netCDF file so that I can read the data easily with co-ordinates in future.
My numpy array looks like this
RZS = np.load("/home/chandra/Data/rootzone_CHIRPS_era5_2003-2015_daily-analysis_annual-result.npy")
RZS.shape
Output: (16, 250, 186)
As you can see my above numpy array represents annual values for 16 years.
chirps_precip =xarray.open_mfdataset("/home/chandra/Data/CHIRPS/chirps-v2.0.2000.days_p25.nc")
precip = chirps_precip.precip.sel(latitude = slice(-50,12.5), longitude = slice(-81.25,-34.75))
precip[0,:,:]
Output:
<xarray.DataArray 'precip' (latitude: 250, longitude: 186)>
dask.array<shape=(250, 186), dtype=float32, chunksize=(250, 186)>
Coordinates:
* latitude (latitude) float32 -49.875 -49.625 -49.375 ... 12.125 12.375
* longitude (longitude) float32 -81.125 -80.875 -80.625 ... -35.125 -34.875
time datetime64[ns] 2000-01-01
Attributes:
units: mm/day
standard_name: convective precipitation rate
long_name: Climate Hazards group InfraRed Precipitation with St...
time_step: day
geostatial_lat_min: -50.0
geostatial_lat_max: 50.0
geostatial_lon_min: -180.0
geostatial_lon_max: 180.0
These are the coordinates of the chirps_precip dataset that I want my numpy array RZS to have, with years (2000, 2001, ..., 2015) as the time step.
I have tried some methods like
# from xarray
array = xarray.DataArray(RZS, latitude = 'precip.latitude')
#from netCDF
Dataset.createVariable('rootzone storage cap', np.float32, ('time','lat','lon'))
But I am not able to get anywhere. I also tried to copy attrs and coords, but that didn't work either.
It seems like I am doing this the wrong way. Can anyone suggest what I am missing?
I want my numpy array to have the same coordinates as the netCDF file, but with the time coordinate changed to years.
I would suggest code like the following, using the netCDF4 module, assuming you have latitude and longitude in variables lat and lon and your data in dataout.
#!/usr/bin/env ipython
# ---------------------
import numpy as np
import datetime
from netCDF4 import Dataset, num2date, date2num
# -----------------------
nyears = 16
unout = 'days since 2000-01-01 00:00:00'
# -----------------------
ny, nx = (250, 186)
lon = np.linspace(9, 30, nx)
lat = np.linspace(50, 60, ny)
dataout = np.random.random((nyears, ny, nx))  # create some random data
datesout = [datetime.datetime(2000 + iyear, 1, 1) for iyear in range(nyears)]  # create date values
# =========================
ncout = Dataset('myfile.nc', 'w', format='NETCDF3_CLASSIC')  # use a netCDF3 output format
ncout.createDimension('lon', nx)
ncout.createDimension('lat', ny)
ncout.createDimension('time', nyears)
lonvar = ncout.createVariable('lon', 'float32', ('lon',))
lonvar[:] = lon
latvar = ncout.createVariable('lat', 'float32', ('lat',))
latvar[:] = lat
timevar = ncout.createVariable('time', 'float64', ('time',))
timevar.setncattr('units', unout)
timevar[:] = date2num(datesout, unout)
myvar = ncout.createVariable('myvar', 'float32', ('time', 'lat', 'lon'))
myvar.setncattr('units', 'mm')
myvar[:] = dataout
ncout.close()
Compared to xarray, you have to write more code, but it is still very easy to create the netCDF files using that module.
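For comparison, a minimal xarray sketch, assuming the chirps_precip subset precip from the question is still open so its latitude/longitude coordinates can be reused (years are kept as plain integers here; encode them as dates if you need a CF-compliant time axis):
import numpy as np
import xarray as xr

years = np.arange(2000, 2016)  # 16 annual steps
da = xr.DataArray(
    RZS,
    dims=('time', 'latitude', 'longitude'),
    coords={'time': years,
            'latitude': precip.latitude,
            'longitude': precip.longitude},
    name='rootzone_storage_capacity',
)
da.to_netcdf('rootzone_storage.nc')  # output filename is just an example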

How can I count the number of polygons a shape intersects?

I have a very large dataset with polygons, and points with buffers around them. I would like to create a new column in the points data which contains the number of polygons each point's buffer intersects.
Here's a simplified example:
import pandas as pd
import geopandas as gp
from shapely.geometry import Polygon
from shapely.geometry import Point
import matplotlib.pyplot as plt
## Create polygons and points ##
df = gp.GeoDataFrame([['a',Polygon([(1, 0), (1, 1), (2,2), (1,2)])],
['b',Polygon([(1, 0.25), (2,1.25), (3,0.25)])]],
columns = ['name','geometry'])
df = gp.GeoDataFrame(df, geometry = 'geometry')
points = gp.GeoDataFrame( [['box', Point(1.5, 1.115), 4],
['triangle', Point(2.5,1.25), 8]],
columns=['name', 'geometry', 'value'],
geometry='geometry')
##Set a buffer around the points##
buf = points.buffer(0.5)
points['buffer'] = buf
points = points.drop(['geometry'], axis = 1)
points = points.rename(columns = {'buffer': 'geometry'})
This data looks like this:
What I'd like to do is create another column in the points dataframe that includes the number of polygons that point intersects.
I've tried utilising a for loop as such:
points['intersect'] = []
for geo1 in points['geometry']:
    for geo2 in df['geometry']:
        if geo1.intersects(geo2):
            points['intersect'].append('1')
Which I would then sum to get the total number of intersects.
However, I get the error: 'Length of values does not match length of index'. I know this is because it is attempting to assign three rows of data to a frame with only two rows.
How can I aggregate the counts so the first point is assigned a value of 2 and the second a value of 1?
If you have a large dataset, I would go for a solution using an rtree spatial index, something like this.
import pandas as pd
import geopandas as gp
from shapely.geometry import Polygon
from shapely.geometry import Point
import matplotlib.pyplot as plt
## Create polygons and points ##
df = gp.GeoDataFrame([['a',Polygon([(1, 0), (1, 1), (2,2), (1,2)])],
['b',Polygon([(1, 0.25), (2,1.25), (3,0.25)])]],
columns = ['name','geometry'])
df = gp.GeoDataFrame(df, geometry = 'geometry')
points = gp.GeoDataFrame( [['box', Point(1.5, 1.115), 4],
['triangle', Point(2.5,1.25), 8]],
columns=['name', 'geometry', 'value'],
geometry='geometry')
# generate spatial index
sindex = df.sindex
# define empty list for results
results_list = []
# iterate over the points
for index, row in points.iterrows():
    buffer = row['geometry'].buffer(0.5)  # buffer
    # find approximate matches with r-tree, then precise matches from those approximate ones
    possible_matches_index = list(sindex.intersection(buffer.bounds))
    possible_matches = df.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(buffer)]
    results_list.append(len(precise_matches))
# add list of results as a new column
points['polygons'] = pd.Series(results_list)
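For completeness, a shorter alternative sketch using a spatial join, which uses the spatial index under the hood (in older geopandas versions the keyword is op rather than predicate):
# buffer the points, spatially join against the polygons, and count matches per point
buffered = points.copy()
buffered['geometry'] = points.geometry.buffer(0.5)
joined = gp.sjoin(buffered, df[['geometry']], how='left', predicate='intersects')
points['polygons'] = joined.groupby(joined.index)['index_right'].count()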

How to create an accurate buffer of 5 miles around a coordinate in python?

I would like to create an accurate buffer of 5 miles around a coordinate, my current code is:
cpr_gdf['p_buffer']=cpr_gdf['coordinates'].buffer(5*(1/60))
The coordinates column was created with this code:
cpr_df['coordinates']=list(zip(cpr_df.sample_longitude_decimal,cpr_df.sample_latitude_decimal))
cpr_df['coordinates']=cpr_df['coordinates'].apply(Point)
cpr_gdf=gpd.GeoDataFrame(cpr_df,geometry='coordinates',crs={'init' :'epsg:4326'})
Thanks for any help!
You need to convert to an equal area projection that is most accurate to where your buffer will be (good resource at https://epsg.io/)
For example, I'm making maps in Michigan, so I'm using EPSG:3174 (which I believe is in meters, correct me if wrong). Given you've already converted your dataframe to a GeoPandas dataframe, you can convert your current projection to 3174 and then create your buffer (converting miles to meters)
cpr_gdf= cpr_gdf.to_crs({'init': 'epsg:3174'})
buffer_length_in_meters = (5 * 1000) * 1.60934
cpr_gdf['geometry'] = cpr_gdf.geometry.buffer(buffer_length_in_meters)
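If the buffered geometries need to go back to geographic coordinates afterwards (for example to overlay them on other EPSG:4326 layers), a small follow-up sketch, assuming the cpr_gdf from above:
# the factor above is just a miles-to-meters conversion: 5 mi * 1609.34 m/mi ≈ 8046.7 m
# after buffering in the projected CRS, reproject back to lon/lat for use with EPSG:4326 layers
cpr_gdf = cpr_gdf.to_crs(epsg=4326)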
You can calculate a buffer around points without converting to any other CRS using the function below. It calculates in meters, so if you want to use miles just multiply the distance by 1609.34.
Here is an example
from geographiclib.geodesic import Geodesic
import numpy as np
from shapely.geometry import Polygon
import pandas as pd
import geopandas as gpd
def geod_buffer(gdf, distance, resolution=16, geod=Geodesic.WGS84):
    """
    gdf - GeoDataFrame with geometry column
    distance - the radius of the buffer in meters
    resolution - the resolution of the buffer around each vertex
    geod - define an ellipsoid
    """
    buffer = list()
    for index, row in gdf.iterrows():
        lon1, lat1 = row['geometry'].x, row['geometry'].y
        buffer_ = list()
        for azi1 in np.arange(0, 360, 90/resolution):
            properties = geod.Direct(lat1, lon1, azi1, distance)
            buffer_.append([properties['lon2'], properties['lat2']])
        buffer.append(Polygon(buffer_))
    return buffer
locations = pd.DataFrame([
{
'longitude': 54.604972,
'latitude': 18.346815},
{
'longitude': 54.605917,
'latitude': 18.347249}
])
locations_gpd = gpd.GeoDataFrame(locations,
geometry=gpd.points_from_xy(locations.longitude, locations.latitude),
crs='epsg:4326').drop(columns=['longitude', 'latitude'])
locations_gpd['geometry'] = geod_buffer(locations_gpd, 1000)
At the equator, one minute of latitude or longitude is ~1.84 km or 1.15 mi (ref).
So if you define your point as P = [y, x], you can create a rough buffer around it of about 4.35 minutes, which is approximately 5 miles: buffer = 0.0725 (degrees). The bounding box is then easily obtained with
minlat = P[0] - buffer
maxlat = P[0] + buffer
minlon = P[1] - buffer
maxlon = P[1] + buffer
Keep in mind this is only a rough approximation: away from the equator one minute of longitude spans less ground, so the box will not be an accurate 5-mile buffer.
