Rust pattern matching with an existing binding? - rust

I am learning Rust from yesterday. The following code is simple --
use encoding_rs::Encoding;
use std::fs;
use std::fs::File;
use std::io::BufReader;
use std::io::Read;
use std::option::Option;
use std::path::Path;
extern crate encoding_rs;
extern crate encoding_rs_io;
fn main() {
let mut reader = BufReader::new(file);
let mut bom: [u8; 3] = [0; 3];
// read BOM
if let Ok(_) = reader.read_exact(&mut bom) {
// sniff BOM
// Because Rust disallows NULLs, hence I declare `Option<&Encoding>` to store the result of encoding.
let mut enc: Option<&Encoding> = None;
match Encoding::for_bom(&bom) {
Some((encoding, _)) => {
// <-- Some((enc, _))
enc = Some(encoding);
}
None => {
if let Some(encoding) = Encoding::for_label("UTF-8".as_bytes()) {
enc = Some(encoding);
}
}
}
if let Some(encoding) = enc {
println!("{:?}", encoding);
}
}
}
It opens a text file, and try to analyze its encoding by parsing BOM(Byte Order Marker). If Encoding::for_bom does not return an encoding, the code will take use UTF-8 as default.
I dislike unwrap() because it always assume there is a valid result
My question is : is there a way to do pattern matching and put the result directly into an existing mutable binding?
e.g. Change Some((encoding, _)) to Some((enc, _)) hence I don't need the line of enc = Some(encoding)

Many rust constructs can be used as expressions, i.e. they can return a value. So if every branch of your match returns a value of the same type, you can assign it directly into a variable. It does not need to be mutable unless you plan to change it later.
let mut reader = BufReader::new(file);
let mut bom: [u8; 3] = [0; 3];
if let Ok(_) = reader.read_exact(&mut bom) {
let enc = match Encoding::for_bom(&bom) {
Some((encoding, _)) => Some(encoding),
None => Encoding::for_label("UTF-8".as_bytes()),
};
if let Some(encoding) = enc {
println!("{:?}", encoding);
}
}

I'd use a combination of map and or_else:
let enc = Encoding::for_bom(&bom)
.map(|t| t.0)
.or_else(|| Encoding::for_label ("UTF-8".as_bytes()));
Or (clearer but slightly longer):
let enc = Encoding::for_bom(&bom)
.map(|(e, _)| e)
.or_else(|| Encoding::for_label ("UTF-8".as_bytes()));

Related

What is the most efficient way to read the first line of a file separately to the rest of the file?

I am trying to figure out the best way to read the contents of a file. The problem is that I need to read the first line separately, because I need that to be parsed as a usize which I need for the dimension of a Array2 by ndarray.
I tried the following:
use ndarray::prelude::*;
use std::io:{BufRead,BufReader};
use std::fs;
fn read_inputfile(geom_filename: &str) -> (Vec<i32>, Array2<f64>, usize) {
//* Step 1: Read the coord data from input
println!("Inputfile: {geom_filename}");
let geom_file = fs::File::open(geom_filename).expect("Geometry file not found!");
let geom_file_reader = BufReader::new(geom_file);
let geom_file_lines: Vec<String> = geom_file_reader
.lines()
.map(|line| line.expect("Failed to read line!"))
.collect();
//* Read no of atoms first for array size
let no_atoms: usize = geom_file_lines[0].parse().unwrap();
let mut Z_vals: Vec<i32> = Vec::new();
let mut geom_matr: Array2<f64> = Array2::zeros((no_atoms, 3));
for (atom_idx, line) in geom_file_lines[1..].iter().enumerate() {
//* into_iter would do the same
let line_split: Vec<&str> = line.split_whitespace().collect();
Z_vals.push(line_split[0].parse().unwrap());
(0..3).for_each(|cart_coord| {
geom_matr[(atom_idx, cart_coord)] = line_split[cart_coord + 1].parse().unwrap();
});
}
(Z_vals, geom_matr, no_atoms)
}
Does this not kind of defeat the purpose of the BufReader? I am still relative new to Rust, so I might have misunderstood something, but I thought that one uses the BufReader so that the whole file does not need to be read into memory.
With the Vec<String> for geom_file_lines I am mostlike loading the whole file into memory again, right?
Does this not kind of defeat the purpose of the BufReader?
It very much does, yes. lines() gives you an iterator, so you can read them without loading all of them into memory at once. You force them all into memory, though, as you call collect().
Simply don't do that. Use the iterator as an iterator. Especially as you convert it back to an iterator later, via geom_file_lines[1..].iter().
Like this:
use ndarray::prelude::*;
use std::fs;
use std::io::{BufRead, BufReader};
pub fn read_inputfile(geom_filename: &str) -> (Vec<i32>, Array2<f64>, usize) {
//* Step 1: Read the coord data from input
println!("Inputfile: {geom_filename}");
let geom_file = fs::File::open(geom_filename).expect("Geometry file not found!");
let geom_file_reader = BufReader::new(geom_file);
let mut geom_file_lines = geom_file_reader
.lines()
.map(|line| line.expect("Failed to read line!"));
//* Read no of atoms first for array size
let no_atoms: usize = geom_file_lines.next().unwrap().parse().unwrap();
let mut z_vals: Vec<i32> = Vec::new();
let mut geom_matr: Array2<f64> = Array2::zeros((no_atoms, 3));
for (atom_idx, line) in geom_file_lines.enumerate() {
let line_split: Vec<&str> = line.split_whitespace().collect();
z_vals.push(line_split[0].parse().unwrap());
(0..3).for_each(|cart_coord| {
geom_matr[(atom_idx, cart_coord)] = line_split[cart_coord + 1].parse().unwrap();
});
}
(z_vals, geom_matr, no_atoms)
}
You can apply the same logic in your for loop:
for (atom_idx, line) in geom_file_lines.enumerate() {
let mut line_split = line.split_whitespace();
z_vals.push(line_split.next().unwrap().parse().unwrap());
(0..3).for_each(|cart_coord| {
geom_matr[(atom_idx, cart_coord)] = line_split.next().unwrap().parse().unwrap();
});
}

Splitting a Vec of strings into Vec<Vec<String>>

I am attempting to relearn data-science in rust.
I have a Vec<String> that includes a delimiter "|" and a new line "!end".
What I'd like to end up with is Vec<Vec<String>> that can be put into a 2D ND array.
I have this python Code:
file = open('somefile.dat')
lst = []
for line in file:
lst += [line.split('|')]
df = pd.DataFrame(lst)
SAMV2FinalDataFrame = pd.DataFrame(lst,columns=column_names)
And i've recreated it here in rust:
fn lines_from_file(filename: impl AsRef<Path>) -> Vec<String> {
let file = File::open(filename).expect("no such file");
let buf = BufReader::new(file);
buf.lines()
.map(|l| l.expect("Could not parse line"))
.collect()
}
fn main() {
let lines = lines_from_file(".dat");
let mut new_arr = vec![];
//Here i get a lines immitable borrow
for line in lines{
new_arr.push([*line.split("!end")]);
}
// here i get expeected closure found str
let x = lines.split("!end");
let array = Array::from(lines)
what i have: ['1','1','1','end!','2','2','2','!end']
What i need: [['1','1','1'],['2','2','2']]
Edit: also why when i turbo fish does it make it disappear on Stack Overflow?
I think part of the issue you ran into was due how you worked with arrays. For example, Vec::push will only add a single element so you would want to use Vec::extend instead. I also ran into a few cases of empty strings due to splitting by "!end" would leave trailing '|' on the ends of substrings. The errors were quite strange, I am not completely sure where the closure came from.
let lines = vec!["1|1|1|!end|2|2|2|!end".to_string()];
let mut new_arr = Vec::new();
// Iterate over &lines so we don't consume lines and it can be used again later
for line in &lines {
new_arr.extend(line.split("!end")
// Remove trailing empty string
.filter(|x| !x.is_empty())
// Convert each &str into a Vec<String>
.map(|x| {
x.split('|')
// Remove empty strings from ends split (Ex split: "|2|2|2|")
.filter(|x| !x.is_empty())
// Convert &str into owned String
.map(|x| x.to_string())
// Turn iterator into Vec<String>
.collect::<Vec<_>>()
}));
}
println!("{:?}", new_arr);
I also came up with this other version which should handle your use case better. The earlier approach dropped all empty strings, while this one should preserve them while correctly handling the "!end".
use std::io::{self, BufRead, BufReader, Read, Cursor};
fn split_data<R: Read>(buffer: &mut R) -> io::Result<Vec<Vec<String>>> {
let mut sections = Vec::new();
let mut current_section = Vec::new();
for line in BufReader::new(buffer).lines() {
for item in line?.split('|') {
if item != "!end" {
current_section.push(item.to_string());
} else {
sections.push(current_section);
current_section = Vec::new();
}
}
}
Ok(sections)
}
In this example, I used Read for easier testing, but it will also work with a file.
let sample_input = b"1|1|1|!end|2|2|2|!end";
println!("{:?}", split_data(&mut Cursor::new(sample_input)));
// Output: Ok([["1", "1", "1"], ["2", "2", "2"]])
// You can also use a file instead
let mut file = File::new("somefile.dat");
let solution: Vec<Vec<String>> = split_data(&mut file).unwrap();
playground link

How to hash a binary file in Rust

In rust, using sha256 = "1.0.2" (or similar), how do I hash a binary file (i.e. a tar.gz archive)?
I'm trying to get the sha256 of that binary file.
This doesn't work:
fn hash() {
let file = "file.tar.gz";
let computed_hash = sha256::digest_file(std::path::Path::new(file)).unwrap();
computed_hash
}
the output is:
...
Error { kind: InvalidData, message: "stream did not contain valid UTF-8" }
The sha2 crate upon which depends supports hashing Readable objects without needing to read the entire file into memory. See the example in the hashes readme.
use sha2::{Sha256, Digest};
use std::{io, fs};
let mut hasher = Sha256::new();
let mut file = fs::File::open("file.tar.gz")?;
let bytes_written = io::copy(&mut file, &mut hasher)?;
let hash_bytes = hasher.finalize();
Edit:
Upgrading to sha256 = "1.0.3" should fix this
The issue is that digest_file is internally reading the file to a String, which requires that it contains valid UTF-8, which is obviously not what you want in this case.
Instead, you could read the file in as bytes and pass that into sha256::digest_bytes:
let bytes = std::fs::read(path).unwrap(); // Vec<u8>
let hash = sha256::digest_bytes(&bytes);
Here's an implementation using the sha2 crate that doesn't read the entire file into memory, and doesn't depend on the ring crate. In my case, ring isn't pure rust, which leads to cross-compilation difficulties.
use data_encoding::HEXLOWER;
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
/// calculates sha256 digest as lowercase hex string
fn sha256_digest(path: &PathBuf) -> Result<String> {
let input = File::open(path)?;
let mut reader = BufReader::new(input);
let digest = {
let mut hasher = Sha256::new();
let mut buffer = [0; 1024];
loop {
let count = reader.read(&mut buffer)?;
if count == 0 { break }
hasher.update(&buffer[..count]);
}
hasher.finalize()
};
Ok(HEXLOWER.encode(digest.as_ref()))
}

Is there a way of removing quotation marks when using the quote crate?

use quote::quote; // 1.0.7
use syn; // 1.0.40
fn main() {
let foo = "foo";
let foobar = syn::Ident::new(&format!("{}bar", foo), syn::export::Span::call_site());
let testing = format!("self.{}.is_some()", foobar);
let q = quote! {#testing};
println!("{}", q);
}
playground
The stdout is
"self.foobar.is_some()"
I need
self.foobar.is_some()
If you don't want to treat something as a string, don't use a string. Instead, quote the source code directly:
use quote::{format_ident, quote}; // 1.0.7
fn main() {
let foo = "foo";
let foobar = format_ident!("{}bar", foo);
let q = quote! { self.#foobar.is_some() };
println!("{}", q);
}
self . foobar . is_some ()
If your data has to be a string (e.g. you read it from a configuration file), then you can use syn to parse the string back into the AST and the apply the above solution. This does require that you need to know what to parse the string as.
In this case, we want to parse the whole string as an expression:
use quote::quote; // 1.0.7
use syn; // 1.0.40
fn main() {
let foobar = "self.foobar.is_some()";
let e: syn::Expr = syn::parse_str(foobar).expect("Unable to parse");
let q = quote! { #e };
println!("{}", q);
}

How to implement a lightweight long-lived thread based on a generator or asynchronous function in Rust?

I want to implement a user interaction script in the form of a lightweight, long-lived thread written in Rust. Inside the script, I have points where I asynchronously await user input.
In JavaScript, I would use a generator, inside which you can pass a question, and get back an answer, for example:
function* my_scenario() {
yield "What is your name?";
let my_name = yield "How are you feeling?";
let my_mood = yield "";
...
}
let my_session = my_scenario();
...
my_session.next("Peter");
my_session.next("happy");
However, Rust's generator method resume() contains no parameters! I cannot clone a generator or return it from a function in order to have many user sessions with different states. Instead of a generator, I thought of using an async fn(), but I do not understand how to call it at each step, passing the value there.
The return value from yield is effectively just another generator that has been implicitly passed to the first generator, except that it forces the two to be tied together in weird ways.
You can see that in your original code by the junk yield "" that you need in order to get a value even though you don't have anything to return. Additionally, your example requires that the user of the generator know the answer to the question before it is asked, which seems very unorthodox.
Explicitly pass in a second generator:
#![feature(generators, generator_trait)]
use std::{
io,
ops::{Generator, GeneratorState},
};
fn user_input() -> impl Generator<Yield = String> {
|| {
let input = io::stdin();
loop {
let mut line = String::new();
input.read_line(&mut line).unwrap();
yield line;
}
}
}
fn my_scenario(
input: impl Generator<Yield = String>,
) -> impl Generator<Yield = &'static str, Return = String> {
|| {
let mut input = Box::pin(input);
yield "What is your name?";
let my_name = match input.as_mut().resume(()) {
GeneratorState::Yielded(v) => v,
GeneratorState::Complete(_) => panic!("input did not return a value"),
};
yield "How are you feeling?";
let my_mood = match input.as_mut().resume(()) {
GeneratorState::Yielded(v) => v,
GeneratorState::Complete(_) => panic!("input did not return a value"),
};
format!("{} is {}", my_name.trim(), my_mood.trim())
}
}
fn main() {
let my_session = my_scenario(user_input());
let mut my_session = Box::pin(my_session);
loop {
match my_session.as_mut().resume(()) {
GeneratorState::Yielded(prompt) => {
println!("{}", prompt);
}
GeneratorState::Complete(v) => {
println!("{}", v);
break;
}
}
}
}
$ cargo run
What is your name?
Shep
How are you feeling?
OK
Shep is OK
You can provide hard-coded data as well:
let user_input = || {
yield "Peter".to_string();
yield "happy".to_string();
};
let my_session = my_scenario(user_input);
As of approximately Rust nightly 2020-02-08, Rust's generators now accept an argument to resume, more closely matching the original JavaScript example:
#![feature(generators, generator_trait)]
use std::{
io::{self, BufRead},
ops::{Generator, GeneratorState},
};
fn my_scenario() -> impl Generator<String, Yield = &'static str, Return = String> {
|_arg: String| {
let my_name = yield "What is your name?";
let my_mood = yield "How are you feeling?";
format!("{} is {}", my_name.trim(), my_mood.trim())
}
}
fn main() {
let my_session = my_scenario();
let mut my_session = Box::pin(my_session);
let stdin = io::stdin();
let mut lines = stdin.lock().lines();
let mut line = String::new();
loop {
match my_session.as_mut().resume(line) {
GeneratorState::Yielded(prompt) => {
println!("{}", prompt);
}
GeneratorState::Complete(v) => {
println!("{}", v);
break;
}
}
line = lines.next().expect("User input ended").expect("User input malformed");
}
}

Resources