I am at a loss as to which Polars interface I should pass s3_bytes (from a JSON Lines file) to in order to get a DataFrame:
impl S3_Operation {
    pub async fn new(file: File) -> Self {
        let config = aws_config::load_from_env().await;
        let client = Client::new(&config);
        S3_Operation { client, file }
    }

    pub async fn download_object(&self) -> Cursor<Bytes> {
        let resp = self
            .client
            .get_object()
            .bucket(&self.file.bucket)
            .key(&self.file.key)
            .send()
            .await;
        let s3_bytes = resp.unwrap().body.collect().await.unwrap().into_bytes();
I want to get a Polars DataFrame at the end of the day without moving s3_bytes to local storage. What am I missing here? JsonReader has no new fn, while LazyJsonLineReader needs a path ...
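For what it's worth, one direction that should work (a sketch, not tied to a specific Polars release: it assumes the json feature is enabled and that JsonReader still exposes SerReader::new plus JsonFormat::JsonLines) is to wrap the downloaded bytes in an in-memory Cursor and hand that to JsonReader:

use polars::prelude::*;
use std::io::Cursor;

// s3_bytes: the bytes::Bytes collected from the S3 GetObject body above.
fn jsonl_bytes_to_dataframe(s3_bytes: bytes::Bytes) -> PolarsResult<DataFrame> {
    // Cursor gives Polars the in-memory Read + Seek source it wants, no temp file needed.
    let cursor = Cursor::new(s3_bytes.to_vec());
    JsonReader::new(cursor)
        .with_json_format(JsonFormat::JsonLines)
        .finish()
}

The to_vec() copy just keeps the reader type simple; jsonl_bytes_to_dataframe is a made-up helper name, and download_object could do the same thing inline instead of returning a Cursor.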
Related
*Repo w/ WIP code: https://github.com/jmelm93/rust-datafusion-csv-processing
Started programming with Rust 2 days ago, and have been trying to resolve this since ~3 hours into trying out Rust...
Any help would be appreciated.
My goal is to write a DataFrame from DataFusion to JSON (which will eventually be used to respond to HTTP requests in an API with the JSON string).
The DataFrame turns into a "datafusion::arrow::record_batch::RecordBatch" (a Vec of them, to be precise) when you collect the data, and this data type is what I'm having trouble converting.
I've tried -
Using json::writer::record_batches_to_json_rows from Arrow, but the compiler won't let me due to "struct datafusion::arrow::record_batch::RecordBatch and struct arrow::record_batch::RecordBatch have similar names, but are actually distinct types". I haven't been able to convert between the types to avoid this.
Turning the RecordBatch into a vec and pulling out the headers and the values individually. I was able to get the headers out, but haven't had success with the values.
let mut header = Vec::new();
// let mut rows = Vec::new();
for record_batch in record_batches {
    // get data
    println!("record_batch.columns: {:?}", record_batch.columns());
    for col in record_batch.columns() {
        for row in 0..col.len() {
            // println!("Cow: {:?}", col);
            // println!("Row: {:?}", row);
            // let value = col.as_any().downcast_ref::<StringArray>().unwrap().value(row);
            // rows.push(value);
        }
    }
    // get headers
    for field in record_batch.schema().fields() {
        header.push(field.name().to_string());
    }
}
Anyone know how to accomplish this?
The full script is below:
// datafusion examples: https://github.com/apache/arrow-datafusion/tree/master/datafusion-examples/examples
// datafusion docs: https://arrow.apache.org/datafusion/
use datafusion::prelude::*;
use datafusion::arrow::datatypes::{Schema};
use arrow::json;
// use serde::{ Deserialize };
use serde_json::to_string;
use std::sync::Arc;
use std::str;
use std::fs;
use std::ops::Deref;
type DFResult = Result<Arc<DataFrame>, datafusion::error::DataFusionError>;
struct FinalObject {
    schema: Schema,
    // columns: Vec<Column>,
    num_rows: usize,
    num_columns: usize,
}
// to allow debug logging for FinalObject
impl std::fmt::Debug for FinalObject {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // write!(f, "FinalObject {{ schema: {:?}, columns: {:?}, num_rows: {:?}, num_columns: {:?} }}",
        write!(f, "FinalObject {{ schema: {:?}, num_rows: {:?}, num_columns: {:?} }}",
            // self.schema, self.columns, self.num_columns, self.num_rows)
            self.schema, self.num_columns, self.num_rows)
    }
}
fn create_or_delete_csv_file(path: String, content: Option<String>, operation: &str) {
    match operation {
        "create" => {
            match content {
                Some(c) => fs::write(path, c.as_bytes()).expect("Problem with writing file!"),
                None => println!("The content is None, no file will be created"),
            }
        }
        "delete" => {
            // Delete the csv file
            fs::remove_file(path).expect("Problem with deleting file!");
        }
        _ => println!("Invalid operation"),
    }
}
async fn read_csv_file_with_inferred_schema(file_name_string: String) -> DFResult {
    // create string csv data
    let csv_data_string = "heading,value\nbasic,1\ncsv,2\nhere,3".to_string();
    // Create a temporary file
    create_or_delete_csv_file(file_name_string.clone(), Some(csv_data_string), "create");
    // Create a session context
    let ctx = SessionContext::new();
    // Register a lazy DataFrame using the context
    let df = ctx.read_csv(file_name_string.clone(), CsvReadOptions::default()).await.expect("An error occurred while reading the CSV string");
    // return the dataframe
    Ok(Arc::new(df))
}
#[tokio::main]
async fn main() {
    let file_name_string = "temp_file.csv".to_string();
    let arc_csv_df = read_csv_file_with_inferred_schema(file_name_string.clone()).await.expect("An error occurred while reading the CSV string (funct: read_csv_file_with_inferred_schema)");
    // have to use ".clone()" each time I want to use this ref
    let deref_df = arc_csv_df.deref();
    // print to console
    deref_df.clone().show().await.expect("An error occurred while showing the CSV DataFrame");
    // collect to vec
    let record_batches = deref_df.clone().collect().await.expect("An error occurred while collecting the CSV DataFrame");
    // println!("Data: {:?}", data);
    // record_batches == <Vec<RecordBatch>>. Convert to RecordBatch
    let record_batch = record_batches[0].clone();
    // let json_string = to_string(&record_batch).unwrap();
    // let mut writer = datafusion::json::writer::RecordBatchJsonWriter::new(vec![]);
    // writer.write(&record_batch).unwrap();
    // let json_rows = writer.finish();
    let json_rows = json::writer::record_batches_to_json_rows(&[record_batch]);
    println!("JSON: {:?}", json_rows);
    // get final values from recordbatch
    // https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html
    // https://users.rust-lang.org/t/how-to-use-recordbatch-in-arrow-when-using-datafusion/70057/2
    // https://github.com/apache/arrow-rs/blob/6.5.0/arrow/src/util/pretty.rs
    // let record_batches_vec = record_batches.to_vec();
    let mut header = Vec::new();
    // let mut rows = Vec::new();
    for record_batch in record_batches {
        // get data
        println!("record_batch.columns: {:?}", record_batch.columns());
        for col in record_batch.columns() {
            for row in 0..col.len() {
                // println!("Cow: {:?}", col);
                // println!("Row: {:?}", row);
                // let value = col.as_any().downcast_ref::<StringArray>().unwrap().value(row);
                // rows.push(value);
            }
        }
        // get headers
        for field in record_batch.schema().fields() {
            header.push(field.name().to_string());
        }
    }
// println!("Header: {:?}", header);
// Delete temp csv
create_or_delete_csv_file(file_name_string.clone(), None, "delete");
}
I am not sure that DataFusion is the best place to convert a CSV string into a JSON string, but here is a working version of your code:
#[tokio::main]
async fn main() {
    let file_name_string = "temp_file.csv".to_string();
    let csv_data_string = "heading,value\nbasic,1\ncsv,2\nhere,3".to_string();
    // Create a temporary file
    create_or_delete_csv_file(file_name_string.clone(), Some(csv_data_string), "create");
    // Create a session context
    let ctx = SessionContext::new();
    // Register the csv file
    ctx.register_csv("t1", &file_name_string, CsvReadOptions::new().has_header(false))
        .await.unwrap();
    let df = ctx.sql("SELECT * FROM t1").await.unwrap();
    // collect to vec
    let record_batches = df.collect().await.unwrap();
    // get json rows
    let json_rows = datafusion::arrow::json::writer::record_batches_to_json_rows(&record_batches[..]).unwrap();
    println!("JSON: {:?}", json_rows);
    // Delete temp csv
    create_or_delete_csv_file(file_name_string.clone(), None, "delete");
}
If you encounter arrow and datafusion struct conflicts, use datafusion::arrow instead of just the arrow library.
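Since the stated goal is a JSON string for an HTTP response, one more step should get there (a sketch, assuming the json_rows returned by record_batches_to_json_rows are serde_json maps, which serde_json can serialize directly):

// json_rows is a Vec of serde_json maps, one map per row, so the whole
// vector serializes straight into a JSON array string for the response body.
let json_string = serde_json::to_string(&json_rows)
    .expect("record batch rows should serialize to JSON");
println!("JSON string: {}", json_string);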
I was wondering how I would create a time series Array from a CSV using ndarray?
I have this CSV:
date,value
1959-07-02,0.2930
1959-07-06,0.2910
1959-07-07,0.2820
1959-07-08,0.2846
1959-07-09,0.2760
1959-07-10,0.2757
I'd like to plot it using plotly-rs with ndarray support. I deserialized the CSV successfully, but now I want to know how I can create two Array objects: one with the dates as NaiveDate (or String, as I'm not sure that plotly-rs supports NaiveDate natively), and another with the values as f64. Below is my deserialization code:
use chrono::NaiveDate;
use serde::{de, Deserialize, Deserializer};

#[derive(Deserialize)]
struct Record {
    #[serde(deserialize_with = "naive_date_time_from_str")]
    date: NaiveDate,
    value: f64,
}

fn naive_date_time_from_str<'de, D>(deserializer: D) -> Result<NaiveDate, D::Error>
where
    D: Deserializer<'de>,
{
    let s: String = Deserialize::deserialize(deserializer)?;
    NaiveDate::parse_from_str(&s, "%Y-%m-%d").map_err(de::Error::custom)
}
And I can iterate through the CSV like this:
use csv::ReaderBuilder;
use std::error::Error;

fn main() -> Result<(), Box<dyn Error>> {
    let mut reader = ReaderBuilder::new()
        .has_headers(true)
        .delimiter(b',')
        .from_path("./data/timeseries.csv")?;
    for record in reader.deserialize::<Record>() {
        let record: Record = record?;
        println!(
            "date {}, value = {}",
            record.date.format("%Y-%m-%d").to_string(),
            record.value
        );
    }
    Ok(())
}
But now I'm stuck at creating the two ndarray Array objects. Any hints?
EDIT: A somewhat similar approach is taken in this topic (but without using ndarray): How to push data from a csv::StringRecord to each column vector in a struct?
You can read the CSV data and plot a chart directly, without the additional ndarray step.
use csv::Error;
use plotly::{Plot, Scatter};
fn main() -> Result<(), Error> {
let csv = "date,value
1959-07-02,0.2930
1959-07-06,0.2910
1959-07-07,0.2820
1959-07-08,0.2846
1959-07-09,0.2760
1959-07-10,0.2757";
let mut reader = csv::Reader::from_reader(csv.as_bytes());
let mut date = vec![];
let mut data = vec![];
for record in reader.records() {
let record = record?;
date.push(record[0].to_string());
data.push(record[1].to_string());
}
let trace = Scatter::new(date, data);
let mut plot = Plot::new();
plot.add_trace(trace);
plot.show();
Ok(())
}
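If you do still want the two ndarray arrays from the original question, a minimal sketch (assuming ndarray's Array1, which can be built from a Vec) is to collect into Vecs inside the question's existing deserialization loop and convert at the end:

use ndarray::Array1;

// Collect the deserialized Records (from the question's Record struct) into Vecs first.
let mut dates: Vec<String> = Vec::new();
let mut values: Vec<f64> = Vec::new();
for record in reader.deserialize::<Record>() {
    let record: Record = record?;
    dates.push(record.date.format("%Y-%m-%d").to_string());
    values.push(record.value);
}
// Array1 implements From<Vec<T>>, so the Vecs convert directly.
let dates: Array1<String> = Array1::from(dates);
let values: Array1<f64> = Array1::from(values);

These can then be fed to plotly-rs's ndarray support mentioned in the question.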
I have some working code that reads a file, but I need to generalize it to pull data from additional sources other than simple disk files.
Is Read the correct generalization I should work with in order to replace File?
If so, how can I fix example2 in the following sample code? As is, it fails with the compile error dyn async_std::io::Read cannot be unpinned at the commented line. If not, what type should I return instead from get_read and are there any corresponding changes required in example2?
//! [dependencies]
//! tokio = { version = "1.0.1", features = ["full"] }
//! async-std = "1.8.0"
//! anyhow = "1.0.32"
use async_std::io::prelude::*;
use async_std::fs::File;
use anyhow::Result;
#[tokio::main]
async fn main() -> Result<()> {
    example1().await?;
    example2().await?;
    Ok(())
}

// Example of consuming `File` ... works great!
async fn example1() -> Result<()> {
    let mut file = get_file().await?;
    let mut contents = String::new();
    let _ = file.read_to_string(&mut contents).await?;
    println!("read {} characters", contents.len());
    Ok(())
}

// Example of consuming `Read` ... does not compile?
async fn example2() -> Result<()> {
    let mut read = get_read().await?;
    let mut contents = String::new();
    // ERROR: `dyn async_std::io::Read` cannot be unpinned
    let _ = read.read_to_string(&mut contents).await?;
    println!("read {} characters", contents.len());
    Ok(())
}

async fn get_read() -> Result<Box<dyn Read>> {
    let file = get_file().await?;
    Ok(Box::new(file))
}

async fn get_file() -> Result<File> {
    let file = File::open("/etc/hosts").await?;
    Ok(file)
}
You need to pin:
use std::pin::Pin;

async fn get_read() -> Result<Pin<Box<dyn Read>>> {
    let file = get_file().await?;
    Ok(Box::pin(file))
}
Box<File> (without Pin) works because File implements Unpin. Box<dyn Read + Unpin> would work too.
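For completeness, the Unpin variant mentioned above could look like this (a sketch; the rest of the example stays unchanged, since a boxed trait object is itself Unpin):

// Alternative to pinning: require Unpin on the trait object so that
// read_to_string (which needs an Unpin reader) keeps working.
async fn get_read() -> Result<Box<dyn Read + Unpin>> {
    let file = get_file().await?;
    Ok(Box::new(file))
}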
I have an actix-web service and would like to parse the contents of a multipart field while streaming it through async-gcode, and in addition store the contents, e.g. in a database.
However, I have no clue how to feed the stream into the Parser and at the same time collect the bytes into a Vec<u8> or a String.
The first problem I face is that field is a stream of actix_web::web::Bytes and not of u8.
#[post("/upload")]
pub async fn upload_job(
mut payload: Multipart,
) -> Result<HttpResponse, Error> {
let mut contents : Vec<u8> = Vec::new();
while let Ok(Some(mut field)) = payload.try_next().await {
let content_disp = field.content_disposition().unwrap();
match content_disp.get_name().unwrap() {
"file" => {
while let Some(chunk) = field.next().await {
contents.append(&mut chunk.unwrap().to_vec());
// already parse the contents
// and additionally store contents somewhere
}
}
_ => (),
}
}
Ok(HttpResponse::Ok().finish())
}
Any hint or suggestion is very much appreciated.
One of the options is to wrap the field in a struct and implement the Stream trait for it.
use actix_web::{HttpRequest, HttpResponse, Error};
use futures_util::stream::Stream;
use std::pin::Pin;
use actix_multipart::{Multipart, Field};
use futures::stream::{self, StreamExt};
use futures_util::TryStreamExt;
use std::task::{Context, Poll};
use async_gcode::{Parser, Error as PError};
use bytes::BytesMut;
use std::cell::RefCell;

pub struct Wrapper {
    field: Field,
    buffer: RefCell<BytesMut>,
    index: usize,
}

impl Wrapper {
    pub fn new(field: Field, buffer: RefCell<BytesMut>) -> Self {
        buffer.borrow_mut().truncate(0);
        Wrapper {
            field,
            buffer,
            index: 0,
        }
    }
}
impl Stream for Wrapper {
    type Item = Result<u8, PError>;

    fn poll_next(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Option<Result<u8, PError>>> {
        // Refill the buffer from the multipart field once every buffered byte was consumed.
        if self.index == self.buffer.borrow().len() {
            match Pin::new(&mut self.field).poll_next(cx) {
                Poll::Ready(Some(Ok(chunk))) => self.buffer.get_mut().extend_from_slice(&chunk),
                Poll::Pending => return Poll::Pending,
                Poll::Ready(None) => return Poll::Ready(None),
                Poll::Ready(Some(Err(_))) => {
                    return Poll::Ready(Some(Err(PError::BadNumberFormat/* ??? */)))
                }
            };
        }
        // Yield the next buffered byte, if any.
        if self.index < self.buffer.borrow().len() {
            let b = self.buffer.borrow()[self.index];
            self.index += 1;
            return Poll::Ready(Some(Ok(b)));
        }
        Poll::Ready(None)
    }
}
#[post("/upload")]
pub async fn upload_job(
mut payload: Multipart,
) -> Result<HttpResponse, Error> {
while let Ok(Some(field)) = payload.try_next().await {
let content_disp = field.content_disposition().unwrap();
match content_disp.get_name().unwrap() {
"file" => {
let mut contents: RefCell<BytesMut> = RefCell::new(BytesMut::new());
let mut w = Wrapper::new(field, contents.clone());
let mut p = Parser::new(w);
while let Some(res) = p.next().await {
// Do something with results
};
// Do something with the buffer
let a = contents.get_mut()[0];
}
_ => (),
}
}
Ok(HttpResponse::Ok().finish())
}
Copying the Bytes from the Field won't be necessary once Bytes::try_unsplit is implemented (https://github.com/tokio-rs/bytes/issues/287).
The answer from dmitryvm (thanks for your effort) showed me that there are actually two problems: first, flattening the Bytes into u8's, and secondly, "splitting" the stream into a buffer for later storage and into the async-gcode parser.
This shows how I solved it:
#[post("/upload")]
pub async fn upload_job(
mut payload: Multipart,
) -> Result<HttpResponse, Error> {
let mut contents : Vec<u8> = Vec::new();
while let Ok(Some(mut field)) = payload.try_next().await {
let content_disp = field.content_disposition().unwrap();
match content_disp.get_name().unwrap() {
"file" => {
let field_stream = field
.map_err(|_| async_gcode::Error::BadNumberFormat) // Translate error
.map_ok(|y| { // Translate Bytes into stream with Vec<u8>
contents.extend_from_slice(&y); // Copy and store for later usage
stream::iter(y).map(Result::<_, async_gcode::Error>::Ok)
})
.try_flatten(); // Flatten the streams of u8's
let mut parser = Parser::new(field_stream);
while let Some(gcode) = parser.next().await {
// Process result from parser
}
}
_ => (),
}
}
Ok(HttpResponse::Ok().finish())
}
I am using hyper 0.12 to build a proxy service. When receiving a response body from the upstream server I want to forward it back to the client ASAP, and save the contents in a buffer for later processing.
So I need a function that:
takes a Stream (a hyper::Body, to be precise)
returns a Stream that is functionally identical to the input stream
also returns some sort of Future<Item = Vec<u8>, Error = ...> that is resolved with the buffered contents of the input stream, when the output stream is completely consumed
I can't for the life of me figure out how to do this.
I guess the function I'm looking for will look something like this:
type BufferFuture = Box<Future<Item = Vec<u8>, Error = ()>>;

pub fn copy_body(body: hyper::Body) -> (hyper::Body, BufferFuture) {
    let body2 = ... // ???
    let buffer = body.fold(Vec::<u8>::new(), |mut buf, chunk| {
        buf.extend_from_slice(&chunk);
        // ...somehow send this chunk to body2 also?
    });
    (body2, buffer)
}
Below is what I have tried, and it works until send_data() fails (obviously).
type BufferFuture = Box<Future<Item = Vec<u8>, Error = ()>>;

pub fn copy_body(body: hyper::Body) -> (hyper::Body, BufferFuture) {
    let (mut sender, body2) = hyper::Body::channel();
    let consume =
        body.map_err(|_| ()).fold(Vec::<u8>::new(), move |mut buf, chunk| {
            buf.extend_from_slice(&chunk);
            // What to do if this fails?
            if sender.send_data(chunk).is_err() {}
            Box::new(future::ok(buf))
        });
    (body2, Box::new(consume))
}
However, something tells me I am on the wrong track.
I have found Sink.fanout() which seems like it is what I want, but I do not have a Sink, and I don't know how to construct one. hyper::Body implements Stream but not Sink.
What I ended up doing was implementing a new type of stream that does what I need. This appeared to be necessary because hyper::Body does not implement Sink, nor does hyper::Chunk implement Clone (which is required for Sink::fanout()), so I cannot use any of the existing combinators.
First, a struct that contains all the details we need, with methods to append a new chunk and to signal that the buffer is complete.
struct BodyClone<T> {
    body: T,
    buffer: Option<Vec<u8>>,
    sender: Option<futures::sync::oneshot::Sender<Vec<u8>>>,
}

impl BodyClone<hyper::Body> {
    fn flush(&mut self) {
        if let (Some(buffer), Some(sender)) = (self.buffer.take(), self.sender.take()) {
            if sender.send(buffer).is_err() {}
        }
    }

    fn push(&mut self, chunk: &hyper::Chunk) {
        use hyper::body::Payload;
        let length = if let Some(buffer) = self.buffer.as_mut() {
            buffer.extend_from_slice(chunk);
            buffer.len() as u64
        } else {
            0
        };
        if let Some(content_length) = self.body.content_length() {
            if length >= content_length {
                self.flush();
            }
        }
    }
}
Then I implemented the Stream trait for this struct.
impl Stream for BodyClone<hyper::Body> {
    type Item = hyper::Chunk;
    type Error = hyper::Error;

    fn poll(&mut self) -> futures::Poll<Option<Self::Item>, Self::Error> {
        match self.body.poll() {
            Ok(Async::Ready(Some(chunk))) => {
                self.push(&chunk);
                Ok(Async::Ready(Some(chunk)))
            }
            Ok(Async::Ready(None)) => {
                self.flush();
                Ok(Async::Ready(None))
            }
            other => other,
        }
    }
}
Finally I could define an extension method on hyper::Body:
pub type BufferFuture = Box<Future<Item = Vec<u8>, Error = ()> + Send>;

trait CloneBody {
    fn clone_body(self) -> (hyper::Body, BufferFuture);
}

impl CloneBody for hyper::Body {
    fn clone_body(self) -> (hyper::Body, BufferFuture) {
        let (sender, receiver) = futures::sync::oneshot::channel();
        let cloning_stream = BodyClone {
            body: self,
            buffer: Some(Vec::new()),
            sender: Some(sender),
        };
        (
            hyper::Body::wrap_stream(cloning_stream),
            Box::new(receiver.map_err(|_| ())),
        )
    }
}
This can be used as follows:
let (body, buffer): (hyper::Body, BufferFuture) = body.clone_body();
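In a hyper 0.12 / futures 0.1 proxy handler, the two halves might be wired up roughly like this (a sketch; upstream_response and the tokio::spawn usage are illustrative, not from the original code):

use futures::Future;

// Forward one half of the body to the client immediately and process the
// buffered copy once the forwarded stream has been fully consumed.
let (forward_body, buffer) = upstream_response.into_body().clone_body();
let response = hyper::Response::new(forward_body);
tokio::spawn(buffer.map(|bytes| {
    // `bytes` is the complete Vec<u8> captured while the body streamed through.
    println!("buffered {} bytes from upstream", bytes.len());
}));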