Bincode::deserialize_from with BufferedReader - rust

I am using a toy example here, but the real use case is a very large binary file that was written in the same manner.
I'm trying to read from a very large binary file that I wrote myself with bincode.
The whole idea is to use bincode to easily create big files and iterate over them later.
#[macro_use]
extern crate serde;
extern crate bincode;
// use std::fs::File;
use bincode::serialize_into;
use std::io::BufWriter;
/// Toy record that is serialized with bincode; several of them are written
/// back-to-back into one file.
#[derive(Serialize, Deserialize, PartialEq, Debug)]
pub struct MyStruct {
// Counter values carried by this record.
counter: Vec<u32>,
// Offset value carried by this record.
offset: usize,
}
impl MyStruct {
// omitted for conciseness
/// Builds a record from its two fields.
pub fn new(counter: Vec<u32>, offset: usize) -> Self {
Self { counter, offset }
}
}
/// Writes two `MyStruct` records back-to-back into "foo2.bar", then reads
/// records back from that file until deserialization fails.
fn main() {
    // First record.
    let m = MyStruct::new(vec![100, 500, 100, 500, 100, 500], 0);
    // fill entries in the counter vector
    let mut f = BufWriter::new(File::create("foo2.bar").unwrap());
    serialize_into(&mut f, &m).unwrap();

    // Second record, appended through the same writer.
    let m2 = MyStruct::new(
        vec![100, 600, 100, 500, 100, 600, 500, 101241241, 600],
        35,
    );
    serialize_into(&mut f, &m2).unwrap();
    // Drop the writer before the file is reopened for reading.
    drop(f);

    // The file is read without any buffering here; this is the slow path the
    // question is about.
    let input = File::open("foo2.bar").unwrap();
    let mut vecs: Vec<MyStruct> = Vec::new();
    // Keep decoding records until the first deserialization error.
    while let Ok(record) = bincode::deserialize_from(&input) {
        vecs.push(record);
    }
}
However, because it reads directly from the raw file, it is very slow.
How can I use BufReader to speed up my code inside bincode::deserialize_from?
// The buffered reader that should be handed to `bincode::deserialize_from`.
let input = File::open("foo2.bar").unwrap();
let buffered = BufReader::new(input);

Usage of BufReader:
// Needs `std::io::BufReader` in scope.
let mut buffered = BufReader::new(File::open("foo2.bar").unwrap());
let mut vecs: Vec<MyStruct> = Vec::new();
// Hand the reader over by mutable reference so that every call continues
// where the previous one stopped; the loop ends at the first error.
while let Ok(record) = bincode::deserialize_from(&mut buffered) {
    vecs.push(record);
}

Related

How to decode a large file without running out of memory?

I need to convert a ~4GB IBM866 encoded xml file into UTF-8. I tried this
and this crates, but with both of them I run out of memory.
I tried to do it this way:
fn ibm866_to_utf8(ibm866: &[u8]) -> Result<String, MyError> {
use encoding_rs::IBM866;
let (utf8, _, had_error) = IBM866.decode(ibm866);
if (had_error == true) {
Err(MyError::DecodingError)
} else {
Ok(utf8.to_string())
}
}
/// Reads the whole input file into memory and decodes it in one go.
fn main() {
    let path = "ibm866_file";
    let mut file = File::open(path).unwrap();
    // Reserve room for the complete file up front.
    let mut vec: Vec<u8> = Vec::with_capacity(file.metadata().unwrap().len() as usize);
    // A failed read must not go unnoticed, otherwise a truncated buffer would
    // be decoded as if it were the full file.
    file.read_to_end(&mut vec).unwrap();
    let utf8_string = ibm866_to_utf8(&vec).unwrap();
    // write to file
}
I also tried to iterate file by lines like this:
fn main() {
let path = "ibm866_file";
let mut file = File::open(path).unwrap();
let mut reader = BufReader::new(file);
let mut utf8_string = String::new();
for line in reader.lines() {
let utf8_line = ibm866_to_utf8(line.unwrap().as_bytes())
utf8_string = format!("{}{}", utf8_string, utf8_line.unwrap());
}
// write to file
}
But it panics when reader meets non UTF-8 character.
How to decode large files properly?
link to file: https://drive.google.com/file/d/1fHFS5GWPhApoNRl3CRK-pRMNthIakcZY/view?usp=sharing
You need to use the streaming functionality of encoding_rs.
This requires a bit of boilerplate code, though, to properly feed the chunks read from a file into the conversion function.
This code seems to work on a simple example, as well as on your multi-GB large legends.xml file and reports no conversion errors.
use std::{
fs::File,
io::{Read, Write},
};
use encoding_rs::{CoderResult, IBM866};
/// Size in bytes of each of the two input staging buffers.
const BUF_SIZE: usize = 4096;

/// Two fixed-size byte buffers of which exactly one is "active" at a time.
///
/// The active buffer holds `content` valid bytes at its front. Data that has
/// not been consumed yet can be moved to the front of the other buffer, which
/// then becomes the active one.
struct ConversionBuffers {
    buf1: [u8; BUF_SIZE],
    buf2: [u8; BUF_SIZE],
    /// `true` while `buf1` is the active buffer, `false` while `buf2` is.
    buf1_active: bool,
    /// Number of valid bytes at the front of the active buffer.
    content: usize,
}

impl ConversionBuffers {
    /// Creates an empty, zero-filled pair of buffers with `buf1` active.
    fn new() -> Self {
        Self {
            buf1: [0; BUF_SIZE],
            buf2: [0; BUF_SIZE],
            buf1_active: true,
            content: 0,
        }
    }

    /// Drops the first `consumed` bytes: the remaining valid bytes are copied
    /// to the front of the inactive buffer, which then becomes active.
    ///
    /// # Panics
    ///
    /// Panics if `consumed` exceeds the number of valid bytes.
    fn move_leftovers_and_flip(&mut self, consumed: usize) {
        assert!(
            consumed <= self.content,
            "consumed {} bytes but only {} are buffered",
            consumed,
            self.content
        );
        let (src, dst) = if self.buf1_active {
            (&self.buf1, &mut self.buf2)
        } else {
            (&self.buf2, &mut self.buf1)
        };
        let leftover = self.content - consumed;
        // `u8` is `Copy`, so a plain byte copy is sufficient.
        dst[..leftover].copy_from_slice(&src[consumed..self.content]);
        self.buf1_active = !self.buf1_active;
        self.content = leftover;
    }

    /// Lets `append_action` fill the free tail of the active buffer.
    ///
    /// The closure receives the unused part of the buffer and returns how many
    /// bytes it wrote at the start of that slice.
    ///
    /// # Panics
    ///
    /// Panics if the closure reports more bytes than the slice it was given.
    fn append(&mut self, append_action: impl FnOnce(&mut [u8]) -> usize) {
        let buf = if self.buf1_active {
            &mut self.buf1[self.content..]
        } else {
            &mut self.buf2[self.content..]
        };
        let available = buf.len();
        let appended = append_action(buf);
        assert!(
            appended <= available,
            "append action reported {} bytes but only {} were available",
            appended,
            available
        );
        self.content += appended;
    }

    /// Returns the valid bytes of the active buffer.
    fn get_data(&self) -> &[u8] {
        if self.buf1_active {
            &self.buf1[..self.content]
        } else {
            &self.buf2[..self.content]
        }
    }
}
// Streams "test_ibm866.txt" through an IBM866 decoder in BUF_SIZE-sized steps
// and writes the UTF-8 output to "out_utf8-2.txt", so that the whole file
// never has to be held in memory at once.
fn main() {
let mut decoder = IBM866.new_decoder();
let mut file_in = File::open("test_ibm866.txt").unwrap();
let mut file_out = File::create("out_utf8-2.txt").unwrap();
let mut buffer_in = ConversionBuffers::new();
// Output buffer sized by the decoder's own estimate for BUF_SIZE input bytes,
// falling back to BUF_SIZE when no estimate is returned.
let mut buffer_out = vec![0u8; decoder.max_utf8_buffer_length(BUF_SIZE).unwrap_or(BUF_SIZE)];
let mut file_eof = false;
let mut errors = false;
loop {
// Top up the input buffer until the file has reported end-of-file.
if !file_eof {
buffer_in.append(|buf| {
let num_read = file_in.read(buf).unwrap();
// A read of zero bytes is taken as end-of-file.
if num_read == 0 {
file_eof = true;
}
num_read
});
}
// `file_eof` is passed as the final argument of `decode_to_utf8`.
let (result, num_consumed, num_produced, had_error) =
decoder.decode_to_utf8(buffer_in.get_data(), &mut buffer_out, file_eof);
// Remember that an error occurred, but keep converting.
if had_error {
errors = true;
}
// Only the first `num_produced` bytes of the output buffer are written out.
let produced_data = &buffer_out[..num_produced];
file_out.write_all(produced_data).unwrap();
// Done once the file is exhausted and the decoder reports empty input.
if file_eof && result == CoderResult::InputEmpty {
break;
}
// Carry the input that was not consumed over into the next iteration.
buffer_in.move_leftovers_and_flip(num_consumed);
}
println!("Had conversion errors: {:?}", errors);
}
As @BurntSushi5 pointed out, there is the encoding_rs_io crate that allows us to skip all the boilerplate code:
use std::fs::File;
use encoding_rs::IBM866;
use encoding_rs_io::DecodeReaderBytesBuilder;
/// Copies "test_ibm866.txt" to "out_utf8.txt", decoding IBM866 on the fly.
fn main() {
    let source = File::open("test_ibm866.txt").unwrap();
    let mut sink = File::create("out_utf8.txt").unwrap();

    // Configure the decoding reader step by step, then wrap the input file.
    let mut builder = DecodeReaderBytesBuilder::new();
    builder.encoding(Some(IBM866));
    let mut decoded_stream = builder.build(source);

    std::io::copy(&mut decoded_stream, &mut sink).unwrap();
}

Why does multithreaded writing to a Vec<HashSet<&str>> using raw pointers segfault?

I want to modify a big vector from multiple threads in parallel.
Works fine: u32
use std::thread;
use std::sync::Arc;
// Spawns four threads; thread `t` overwrites element `t` of a shared array
// through a raw pointer. The array is printed after all threads were joined.
fn main() {
let input = Arc::new([1u32, 2, 3, 4]);
let mut handles = Vec::new();
for t in 0..4 {
let inp = input.clone();
let handle = thread::spawn(move || unsafe {
// `as_ptr()` yields a `*const u32` to the first element; the cast removes
// the constness and `offset` selects element `t`.
// NOTE(review): this writes through a pointer derived from shared data that
// has no interior mutability, which Rust classifies as undefined behavior
// even when the program appears to work.
let p = (inp.as_ptr() as *mut u32).offset(t as isize);
*p = inp[t] + t as u32 ;
});
handles.push(handle);
}
for h in handles {
h.join().unwrap();
}
println!("{:?}", input);
}
Segfaults: Vec<HashSet<&str>>
When I change the u32 to Vec<HashSet<&str>>, the pointer does not seem to work.
use std::thread;
use std::sync::Arc;
use std::collections::HashSet;
// Same experiment with a vector of hash sets: four threads insert into the
// data behind a shared `Arc` through a raw pointer.
fn main() {
let mut a = HashSet::new();
a.insert("aaa");
let input = Arc::new(vec![a.clone(), a.clone(), a.clone(), a.clone()]);
let mut handles = Vec::new();
for _t in 0..4 {
let inp = input.clone();
let handle = thread::spawn(move || unsafe {
// NOTE(review): `as_ptr()` resolves to the vector's method and therefore
// points at the first `HashSet` element, yet the pointer is cast to
// `*mut Vec<HashSet<&str>>`. Indexing through it treats the bytes of a
// `HashSet` as if they were a `Vec`.
let p = (inp.as_ptr() as *mut Vec<HashSet<&str>>).offset(0);
// Every thread targets index 0, so the writes are also unsynchronized.
(*p)[0].insert("bbb");
});
handles.push(handle);
}
for h in handles {
h.join().unwrap();
}
println!("{:?}", input);
}
What is the difference?
It is hard to say what is wrong with your initial code as it segfaults in the playground. You are likely invoking undefined behavior by taking a reference to immutable (!) vec and trying to mutate its elements by casting &Vec -> *mut Vec -> &mut Vec (on a method call). Multiple mutable references to the same thing are a big no-no. Besides, your code even uses the same HashSet ((*p)[0]) mutably in parallel, which, again, is undefined behavior.
The easiest way here would be to use crossbeam's scoped threads. They allow referencing stack variables, like your input. Vec can also give out distinct mutable references to its elements without using unsafe. Using this, your code seems to do the expected thing.
use crossbeam::thread;
use std::collections::HashSet;
/// Inserts "bbb" into each of four hash sets, one scoped thread per set.
fn main() {
    let mut a = HashSet::new();
    a.insert("aaa");
    // Four independent copies of the same starting set.
    let mut input = vec![a.clone(), a.clone(), a.clone(), a.clone()];

    let outcome = thread::scope(|scope| {
        // `iter_mut` hands out one distinct mutable reference per element, so
        // every thread owns exclusive access to "its" set.
        for set in input.iter_mut() {
            scope.spawn(move |_| {
                set.insert("bbb");
            });
        }
    });
    outcome.unwrap();

    println!("{:?}", input);
}
I have found the way:
use std::thread;
use std::sync::Arc;
use std::collections::HashSet;
/// Inserts "bbb" into each of four shared hash sets, one thread per set.
///
/// Every set sits behind its own `Mutex`, so the threads can mutate the
/// shared vector without raw pointers and without `unsafe`.
fn main() {
    let mut a = HashSet::new();
    a.insert("aaa");
    // One lock per element: threads working on different sets never contend.
    let input: Arc<Vec<std::sync::Mutex<HashSet<&str>>>> = Arc::new(
        (0..4)
            .map(|_| std::sync::Mutex::new(a.clone()))
            .collect(),
    );
    let mut handles = Vec::new();
    for t in 0..4 {
        let inp = Arc::clone(&input);
        let handle = thread::spawn(move || {
            // Thread `t` only touches element `t`.
            inp[t].lock().unwrap().insert("bbb");
        });
        handles.push(handle);
    }
    for h in handles {
        h.join().unwrap();
    }
    // Copy the sets out of their locks so the output shows plain sets.
    let sets: Vec<HashSet<&str>> = input
        .iter()
        .map(|set| set.lock().unwrap().clone())
        .collect();
    println!("{:?}", sets);
}
Thanks, guys!

Deserializing newline-delimited JSON from a socket using Serde

I am trying to use serde for sending a JSON struct from a client to a server. A newline from the client to the server marks that the socket is done. My server looks like this
/// A point with three unsigned integer coordinates that is exchanged as JSON
/// between client and server.
#[derive(Serialize, Deserialize, Debug)]
struct Point3D {
x: u32,
y: u32,
z: u32,
}
// Serves one connection: reads into a fixed 512-byte buffer, parses the buffer
// as a JSON `Point3D` and writes back x^2 + y^2 + z^2 as text.
// Returns `Ok(())` once a read yields zero bytes; I/O and JSON errors are
// propagated with `?`.
fn handle_client(mut stream: TcpStream) -> Result<(), Error> {
println!("Incoming connection from: {}", stream.peer_addr()?);
let mut buffer = [0; 512];
loop {
let bytes_read = stream.read(&mut buffer)?;
if bytes_read == 0 {
return Ok(());
}
// NOTE: the whole 512-byte buffer is converted and parsed here, not only the
// first `bytes_read` bytes, so the unused zero bytes after the message are
// handed to the JSON parser as well.
let buf_str: &str = str::from_utf8(&buffer).expect("Boom");
let input: Point3D = serde_json::from_str(&buf_str)?;
let result: String = (input.x.pow(2) + input.y.pow(2) + input.z.pow(2)).to_string();
// The reply is sent without a trailing newline.
stream.write(result.as_bytes())?;
}
}
/// Entry point: runs the TCP server (`--server`) or the interactive client
/// (`--client`), depending on the single command line argument.
fn main() {
    let args: Vec<_> = env::args().collect();
    if args.len() != 2 {
        eprintln!("Please provide --client or --server as argument");
        std::process::exit(1);
    }
    if args[1] == "--server" {
        let listener = TcpListener::bind("0.0.0.0:8888").expect("Could not bind");
        // One thread per accepted connection.
        for stream in listener.incoming() {
            match stream {
                Err(e) => eprintln!("failed: {}", e),
                Ok(stream) => {
                    thread::spawn(move || {
                        handle_client(stream).unwrap_or_else(|error| eprintln!("{:?}", error));
                    });
                }
            }
        }
    } else if args[1] == "--client" {
        let mut stream = TcpStream::connect("127.0.0.1:8888").expect("Could not connect to server");
        println!("Please provide a 3D point as three comma separated integers");
        loop {
            let mut input = String::new();
            let mut buffer: Vec<u8> = Vec::new();
            let bytes_read = stdin()
                .read_line(&mut input)
                .expect("Failed to read from stdin");
            if bytes_read == 0 {
                // stdin was closed: nothing more to send.
                break;
            }
            // Reject malformed input with a message instead of panicking.
            let coords: Result<Vec<u32>, _> = input
                .trim()
                .split(',')
                .map(|part| part.trim().parse::<u32>())
                .collect();
            let point = match coords {
                Ok(ref c) if c.len() == 3 => Point3D {
                    x: c[0],
                    y: c[1],
                    z: c[2],
                },
                _ => {
                    eprintln!("Expected three comma separated integers, got {:?}", input.trim());
                    continue;
                }
            };
            // Messages are newline-delimited: terminate the JSON with '\n' and
            // use `write_all` so that a partial write cannot truncate it.
            let mut message = serde_json::to_string(&point).unwrap();
            message.push('\n');
            stream
                .write_all(message.as_bytes())
                .expect("Failed to write to server");
            // Wait for the newline-terminated reply and echo it.
            let mut reader = BufReader::new(&stream);
            reader
                .read_until(b'\n', &mut buffer)
                .expect("Could not read into buffer");
            print!(
                "{}",
                str::from_utf8(&buffer).expect("Could not write buffer as string")
            );
        }
    }
}
How do I know what length of buffer to allocate before reading in the string? If my buffer is too large, serde fails to deserialize it with an error saying that there are invalid characters. Is there a better way to do this?
Place the TcpStream into a BufReader. This allows you to read until a specific byte (in this case a newline). You can then parse the read bytes with Serde:
use std::io::{BufRead, BufReader};
use std::io::Write;
fn handle_client(mut stream: TcpStream) -> Result<(), Error> {
let mut data = Vec::new();
let mut stream = BufReader::new(stream);
loop {
data.clear();
let bytes_read = stream.read_until(b'\n', &mut data)?;
if bytes_read == 0 {
return Ok(());
}
let input: Point3D = serde_json::from_slice(&data)?;
let value = input.x.pow(2) + input.y.pow(2) + input.z.pow(2);
write!(stream.get_mut(), "{}", value)?;
}
}
I'm being a little fancy by reusing the allocation of data, which means it's very important to reset the buffer at the beginning of each loop. I also avoid allocating memory for the result and just print directly to the output stream.

How can I change the return type of this function?

I'm going through the matasano crypto challenges using rust, with rust-crypto for the AES implementation. I have this function to do basic ECB mode encryption (basically taken nearly verbatim from the rust-crypto repository's example):
/// Encrypts `data` under `key` with AES-128 in ECB mode and no padding.
///
/// Returns the collected output bytes, or the `SymmetricCipherError` reported
/// by the encryptor.
pub fn aes_enc_ecb_128(key: &[u8], data: &[u8])
-> Result<Vec<u8>, symmetriccipher::SymmetricCipherError> {
let mut encryptor = aes::ecb_encryptor(
aes::KeySize::KeySize128,
key,
blockmodes::NoPadding);
let mut final_result = Vec::<u8>::new();
let mut read_buffer = buffer::RefReadBuffer::new(data);
// Scratch space of 4096 bytes that the encryptor writes its output into.
let mut buffer = [0; 4096];
let mut write_buffer = buffer::RefWriteBuffer::new(&mut buffer);
loop {
let result = encryptor.encrypt(&mut read_buffer,
&mut write_buffer,
true);
// Move whatever was written to the scratch buffer into the result before
// looking at the outcome of this round.
final_result.extend(write_buffer
.take_read_buffer()
.take_remaining().iter().map(|&i| i));
match result {
// `BufferUnderflow` ends the loop (presumably: all input was consumed).
Ok(BufferResult::BufferUnderflow) => break,
// Any other success value means another round is needed.
Ok(_) => {},
Err(e) => return Err(e)
}
}
Ok(final_result)
}
The above version compiles with no problem, and works as expected. However, to make it fit with the rest of my error handling scheme I'd like to change the return type to Result<Vec<u8>,&'static str>. This is the function with that change applied:
/// Encrypts `data` under `key` with AES-128 in ECB mode and no padding.
///
/// Variant whose error type is `&'static str`: every encryptor error is
/// replaced by the fixed message "Encryption failed".
pub fn aes_enc_ecb_128(key: &[u8], data: &[u8])
-> Result<Vec<u8>, &'static str> {
let mut encryptor = aes::ecb_encryptor(
aes::KeySize::KeySize128,
key,
blockmodes::NoPadding);
let mut final_result = Vec::<u8>::new();
let mut read_buffer = buffer::RefReadBuffer::new(data);
// Scratch space of 4096 bytes that the encryptor writes its output into.
let mut buffer = [0; 4096];
let mut write_buffer = buffer::RefWriteBuffer::new(&mut buffer);
loop {
let result = encryptor.encrypt(&mut read_buffer,
&mut write_buffer,
true);
// Move whatever was written to the scratch buffer into the result before
// looking at the outcome of this round.
final_result.extend(write_buffer
.take_read_buffer()
.take_remaining().iter().map(|&i| i));
match result {
// `BufferUnderflow` ends the loop (presumably: all input was consumed).
Ok(BufferResult::BufferUnderflow) => break,
Ok(_) => {},
// The original error value is discarded here.
Err(_) => return Err("Encryption failed")
}
}
Ok(final_result)
}
When I attempt to compile this version, I get the following error (paths removed for clarity):
error: source trait is private
let result = encryptor.encrypt(&mut read_buffer,
&mut write_buffer,
true);
error: source trait is private
let r = decryptor.decrypt(&mut read_buffer, &mut write_buffer, true);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The only way I've been able to change this type is to wrap the original function in a conversion function like this:
/// Wraps `aes_enc_ecb_128` and replaces its error by a fixed message.
pub fn converted_enc(key: &[u8], data: &[u8])
                     -> Result<Vec<u8>, &'static str> {
    // The success value passes through untouched; only the error is mapped.
    aes_enc_ecb_128(key, data).map_err(|_| "Encryption failed")
}
What should I do instead of the above in order to get the return value to fit with the rest of my API, and why is the more direct method failing?
I'm using the following versions of rust/cargo:
rustc 1.2.0-nightly (0cc99f9cc 2015-05-17) (built 2015-05-18)
cargo 0.2.0-nightly (ac61996 2015-05-17) (built 2015-05-17)
I think you have come across a compiler bug. Your code should compile.
You can add `use crypto::symmetriccipher::Encryptor;` as a workaround:
/// Encrypts `data` under `key` with AES-128 in ECB mode and no padding.
///
/// Any encryptor error is reported as the fixed message "Encryption failed".
pub fn aes_enc_ecb_128(key: &[u8], data: &[u8])
                       -> Result<Vec<u8>, &'static str> {
    // Bringing the trait into scope explicitly is the workaround.
    use crypto::symmetriccipher::Encryptor;

    let mut encryptor = aes::ecb_encryptor(aes::KeySize::KeySize128, key, blockmodes::NoPadding);
    let mut ciphertext = Vec::<u8>::new();
    let mut source = buffer::RefReadBuffer::new(data);
    // Scratch space the encryptor writes each round of output into.
    let mut scratch = [0; 4096];
    let mut sink = buffer::RefWriteBuffer::new(&mut scratch);

    loop {
        let status = encryptor.encrypt(&mut source, &mut sink, true);
        // Collect this round's output before inspecting the status.
        ciphertext.extend(sink.take_read_buffer().take_remaining().iter().cloned());
        match status {
            Err(_) => return Err("Encryption failed"),
            Ok(BufferResult::BufferUnderflow) => break,
            Ok(_) => {}
        }
    }

    Ok(ciphertext)
}

How to slice a large Vec<i32> as &[u8]?

I don't know how to convert a Vec<i32> into a &[u8] slice.
fn main() {
let v: Vec<i32> = vec![1; 100_000_000];
// Placeholder: the question is which expression belongs here.
let v_bytes: &[u8] = /* ... */;
}
I want to write a large Vec<i32> to a file so I can read it back at a future time.
You can use std::slice::from_raw_parts:
// SAFETY: the pointer comes from `v` and the length is exactly the number of
// bytes covered by its `v.len()` elements; `u8` has no alignment requirement
// and every byte of an `i32` is a valid `u8`.
let v_bytes: &[u8] = unsafe {
std::slice::from_raw_parts(
v.as_ptr() as *const u8,
v.len() * std::mem::size_of::<i32>(),
)
};
Following the comments on this answer, you should wrap this code in a function and have the return value borrow the input, so that you use the borrow checker as far as possible:
/// Views a slice of `i32` as the bytes it is stored as (native byte order).
///
/// The returned slice borrows from `v`, so it cannot outlive the input.
fn as_u8_slice(v: &[i32]) -> &[u8] {
    let first_byte = v.as_ptr() as *const u8;
    let byte_len = std::mem::size_of_val(v);
    // SAFETY: `first_byte` and `byte_len` describe exactly the memory of `v`;
    // `u8` has no alignment requirement and any byte is a valid `u8`.
    unsafe { std::slice::from_raw_parts(first_byte, byte_len) }
}
Since Rust 1.30, the best solution is to use slice::align_to:
/// Prints the bytes of eight `i32` values obtained through `align_to`.
fn main() {
    let v = vec![1_i32; 8];
    // SAFETY: every byte of an `i32` is a valid `u8`.
    let (head, body, tail) = unsafe { v.as_slice().align_to::<u8>() };
    // Nothing may be left over in front of or behind the byte view.
    assert!(head.is_empty());
    assert!(tail.is_empty());
    println!("{:#x?}", body);
}
This properly handles the cases where the alignment of the first type and the second type do not match. In this example, I ensure that the alignment of the i32 is greater than that of the u8 via the assert! statements.
I took @swizard's answer and ran with it a bit to get the other side of the coin - reading the vector back in:
use std::fs::File;
use std::io::{Read, Write};
use std::{mem, slice};
/// Views a slice of `i32` as the bytes it is stored as (native byte order).
fn as_u8_slice(v: &[i32]) -> &[u8] {
    let byte_len = mem::size_of_val(v);
    let first_byte = v.as_ptr() as *const u8;
    // SAFETY: pointer and length describe exactly the memory of `v`; `u8` has
    // no alignment requirement and any byte is a valid `u8`.
    unsafe { slice::from_raw_parts(first_byte, byte_len) }
}
/// Rebuilds a `Vec<i32>` from bytes in native byte order, i.e. the inverse of
/// `as_u8_slice`.
///
/// The values are copied into a fresh vector. Re-labelling the byte
/// allocation as a `Vec<i32>` is not an option: that buffer was allocated
/// with the alignment of `u8`, which does not satisfy what
/// `Vec::from_raw_parts` requires for `i32`.
///
/// # Panics
///
/// Panics if the number of bytes is not a multiple of the size of `i32`.
fn from_u8(v: Vec<u8>) -> Vec<i32> {
    let element_size = mem::size_of::<i32>();
    // Make sure we are going to read a full chunk of stuff
    assert_eq!(v.len() % element_size, 0);
    v.chunks(element_size)
        .map(|chunk| i32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
        .collect()
}
/// Writes the raw bytes of `v` to a newly created file at `filename`.
///
/// Panics if the file cannot be created or written.
fn do_write(filename: &str, v: &[i32]) {
    let bytes = as_u8_slice(v);
    File::create(filename).unwrap().write_all(bytes).unwrap();
}
/// Reads the whole file at `filename` and converts its bytes back to `i32`s.
///
/// Panics if the file cannot be opened or read.
fn do_read(filename: &str) -> Vec<i32> {
    let mut raw = Vec::new();
    File::open(filename).unwrap().read_to_end(&mut raw).unwrap();
    from_u8(raw)
}
/// Round trip: dumps a vector to "vector.dump", reads it back, compares both.
fn main() {
    let path = "vector.dump";
    let v = vec![42; 10];
    do_write(path, &v);
    let v2 = do_read(path);
    assert_eq!(v, v2);
    println!("{:?}", v2)
}

Resources