Drain a Vec<_> from within a tokio task

I have a buffer of type Vec<i32> which I want to write to disk and .drain() after a certain time. The problem I'm facing is one of ownership. How can I write the contents of the buffer to disk and .drain() it without taking ownership, or is that not possible in this case because of the tokio tasks?
use tokio::sync::mpsc;
use std::time::Duration;
use std::fs::File;
use std::io::BufWriter;
use serde_json;

#[tokio::main]
async fn main() {
    let mut buffer: Vec<i32> = Vec::new();
    let mut interval = tokio::time::interval(Duration::from_secs(60));
    let (tx, mut rx) = mpsc::unbounded_channel();

    let sender_handle = tokio::spawn(async move {
        for i in 0..5 {
            tx.send(i).unwrap();
        }
    });

    let receiver_handle = tokio::spawn(async move {
        while let Some(x) = rx.recv().await {
            buffer.push(x);
        }
    });

    let interval_handle = tokio::spawn(async move {
        loop {
            interval.tick().await;
            let file = File::create("buffer.txt");
            let mut writer = BufWriter::new(file.unwrap());
            serde_json::to_writer(&mut writer, &buffer);
            buffer.drain(..);
        }
    });

    let (_, _, _) = tokio::join!(sender_handle, receiver_handle, interval_handle);
}

You need some synchronization mechanism so that access to the vector is controlled. An Arc<Mutex<Vec<_>>> would do:
use futures::lock::Mutex;
use std::sync::Arc;
use tokio::sync::mpsc;
use std::time::Duration;
use std::fs::File;
use std::io::BufWriter;
use serde_json;

#[tokio::main]
async fn main() {
    let buffer: Arc<Mutex<Vec<i32>>> = Arc::new(Mutex::new(Vec::new()));
    let mut interval = tokio::time::interval(Duration::from_secs(60));
    let (tx, mut rx) = mpsc::unbounded_channel();

    let sender_handle = tokio::spawn(async move {
        for i in 0..5 {
            tx.send(i).unwrap();
        }
    });

    let receiver_buff = buffer.clone();
    let receiver_handle = tokio::spawn(async move {
        while let Some(x) = rx.recv().await {
            let mut buff = receiver_buff.lock().await;
            buff.push(x);
        }
    });

    let interval_handle = tokio::spawn(async move {
        loop {
            interval.tick().await;
            let file = File::create("buffer.txt");
            let mut writer = BufWriter::new(file.unwrap());
            let mut buff = buffer.lock().await;
            serde_json::to_writer(&mut writer, &*buff).expect("Serialization to go well");
            buff.drain(..);
        }
    });

    let (_, _, _) = tokio::join!(sender_handle, receiver_handle, interval_handle);
}
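As an aside, if you would rather avoid the mutex entirely, you can give one task sole ownership of the buffer and multiplex the channel and the timer with tokio::select!. A minimal sketch of that approach (same channel and interval as above; the blocking file I/O is kept inline for brevity):

use std::fs::File;
use std::io::{BufWriter, Write};
use std::time::Duration;
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel();
    tokio::spawn(async move {
        for i in 0..5 {
            tx.send(i).unwrap();
        }
    });

    // One task owns the buffer, so no Arc<Mutex<_>> is needed.
    let mut buffer: Vec<i32> = Vec::new();
    let mut interval = tokio::time::interval(Duration::from_secs(60));
    loop {
        tokio::select! {
            maybe_x = rx.recv() => {
                match maybe_x {
                    Some(x) => buffer.push(x),
                    None => break, // channel closed: all senders are gone
                }
            }
            _ = interval.tick() => {
                let mut writer = BufWriter::new(File::create("buffer.txt").unwrap());
                serde_json::to_writer(&mut writer, &buffer).unwrap();
                writer.flush().unwrap();
                buffer.drain(..);
            }
        }
    }
}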

Related

How to decode a large file without running out of memory?

I need to convert a ~4 GB IBM866-encoded XML file to UTF-8. I tried two crates for this, but with both of them I ran out of memory.
I tried to do it this way:
use std::fs::File;
use std::io::Read;

fn ibm866_to_utf8(ibm866: &[u8]) -> Result<String, MyError> {
    use encoding_rs::IBM866;
    let (utf8, _, had_error) = IBM866.decode(ibm866);
    if had_error {
        Err(MyError::DecodingError)
    } else {
        Ok(utf8.to_string())
    }
}

fn main() {
    let path = "ibm866_file";
    let mut file = File::open(path).unwrap();
    let mut vec: Vec<u8> = Vec::with_capacity(file.metadata().unwrap().len() as usize);
    file.read_to_end(&mut vec).unwrap();
    let utf8_string = ibm866_to_utf8(&vec).unwrap();
    // write to file
}
I also tried to iterate over the file line by line, like this:
fn main() {
    let path = "ibm866_file";
    let file = File::open(path).unwrap();
    let reader = BufReader::new(file);
    let mut utf8_string = String::new();
    for line in reader.lines() {
        let utf8_line = ibm866_to_utf8(line.unwrap().as_bytes());
        utf8_string = format!("{}{}", utf8_string, utf8_line.unwrap());
    }
    // write to file
}
But it panics as soon as the reader meets a non-UTF-8 character.
How do I decode large files properly?
link to file: https://drive.google.com/file/d/1fHFS5GWPhApoNRl3CRK-pRMNthIakcZY/view?usp=sharing
You need to use the streaming functionality of encoding_rs.
This requires a bit of boilerplate code, though, to properly feed the chunks read from a file into the conversion function.
This code seems to work on a simple example, as well as on your multi-gigabyte legends.xml file, and it reports no conversion errors.
use std::{
    fs::File,
    io::{Read, Write},
};

use encoding_rs::{CoderResult, IBM866};

const BUF_SIZE: usize = 4096;

// Double buffer: unconsumed bytes from one chunk are copied to the front of
// the other buffer, and the next file read appends after them.
struct ConversionBuffers {
    buf1: [u8; BUF_SIZE],
    buf2: [u8; BUF_SIZE],
    buf1_active: bool,
    content: usize,
}

impl ConversionBuffers {
    fn new() -> Self {
        Self {
            buf1: [0; BUF_SIZE],
            buf2: [0; BUF_SIZE],
            buf1_active: true,
            content: 0,
        }
    }

    fn move_leftovers_and_flip(&mut self, consumed: usize) {
        let (src, dst) = if self.buf1_active {
            (&mut self.buf1, &mut self.buf2)
        } else {
            (&mut self.buf2, &mut self.buf1)
        };
        let leftover = self.content - consumed;
        dst[..leftover].clone_from_slice(&src[consumed..self.content]);
        self.buf1_active = !self.buf1_active;
        self.content = leftover;
    }

    fn append(&mut self, append_action: impl FnOnce(&mut [u8]) -> usize) {
        let buf = if self.buf1_active {
            &mut self.buf1[self.content..]
        } else {
            &mut self.buf2[self.content..]
        };
        let appended = append_action(buf);
        self.content += appended;
    }

    fn get_data(&mut self) -> &[u8] {
        if self.buf1_active {
            &self.buf1[..self.content]
        } else {
            &self.buf2[..self.content]
        }
    }
}

fn main() {
    let mut decoder = IBM866.new_decoder();

    let mut file_in = File::open("test_ibm866.txt").unwrap();
    let mut file_out = File::create("out_utf8-2.txt").unwrap();

    let mut buffer_in = ConversionBuffers::new();
    let mut buffer_out = vec![0u8; decoder.max_utf8_buffer_length(BUF_SIZE).unwrap_or(BUF_SIZE)];

    let mut file_eof = false;
    let mut errors = false;
    loop {
        if !file_eof {
            buffer_in.append(|buf| {
                let num_read = file_in.read(buf).unwrap();
                if num_read == 0 {
                    file_eof = true;
                }
                num_read
            });
        }

        // `file_eof` tells the decoder whether this is the last chunk.
        let (result, num_consumed, num_produced, had_error) =
            decoder.decode_to_utf8(buffer_in.get_data(), &mut buffer_out, file_eof);
        if had_error {
            errors = true;
        }

        let produced_data = &buffer_out[..num_produced];
        file_out.write_all(produced_data).unwrap();

        if file_eof && result == CoderResult::InputEmpty {
            break;
        }

        buffer_in.move_leftovers_and_flip(num_consumed);
    }

    println!("Had conversion errors: {:?}", errors);
}
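Side note on the design: decode_to_utf8 is not guaranteed to consume all of its input (for instance when the output buffer fills up mid-stream), which is why ConversionBuffers keeps two buffers and flips between them. move_leftovers_and_flip copies the unconsumed tail to the front of the other buffer, and the next read from the file simply appends after it.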
As @BurntSushi5 pointed out, there is the encoding_rs_io crate that lets us skip all the boilerplate code:
use std::fs::File;

use encoding_rs::IBM866;
use encoding_rs_io::DecodeReaderBytesBuilder;

fn main() {
    let file_in = File::open("test_ibm866.txt").unwrap();
    let mut file_out = File::create("out_utf8.txt").unwrap();

    let mut decoded_stream = DecodeReaderBytesBuilder::new()
        .encoding(Some(IBM866))
        .build(file_in);

    std::io::copy(&mut decoded_stream, &mut file_out).unwrap();
}

Bincode::deserialize_from with BufReader

I am using a toy example here, but the real case is a very large binary file, written in the same manner, that I'm trying to read back.
The whole idea is to use bincode to easily create big files and iterate over them later:
#[macro_use]
extern crate serde;
extern crate bincode;

use std::fs::File;
use bincode::serialize_into;
use std::io::BufWriter;

#[derive(Serialize, Deserialize, PartialEq, Debug)]
pub struct MyStruct {
    counter: Vec<u32>,
    offset: usize,
}

impl MyStruct {
    // omitted for conciseness
    pub fn new(counter: Vec<u32>, offset: usize) -> Self {
        Self { counter, offset }
    }
}

fn main() {
    // fill entries in the counter vector
    let vec = vec![100, 500, 100, 500, 100, 500];
    let m = MyStruct::new(vec, 0);

    let mut f = BufWriter::new(File::create("foo2.bar").unwrap());
    serialize_into(&mut f, &m).unwrap();

    let vec = vec![100, 600, 100, 500, 100, 600, 500, 101241241, 600];
    let m2 = MyStruct::new(vec, 35);
    serialize_into(&mut f, &m2).unwrap();
    drop(f);

    // let mut buffer;
    // buffer = BufReader::open_raw_file("foo2.bar").unwrap();
    let input = File::open("foo2.bar").unwrap();
    // let buffered = BufReader::new(input);

    let mut vecs: Vec<MyStruct> = Vec::new();
    loop {
        match bincode::deserialize_from(&input) {
            Ok(file) => vecs.push(file),
            Err(_) => break,
        };
    }
}
However, because this reads from the raw File, it is very slow.
How can I get bincode::deserialize_from to read through a BufReader?
let input = File::open("foo2.bar").unwrap();
let buffered = BufReader::new(input);
Usage of BufReader:
let input = File::open("foo2.bar").unwrap();
let mut buffered = BufReader::new(input);

let mut vecs: Vec<MyStruct> = Vec::new();
loop {
    match bincode::deserialize_from(&mut buffered) {
        Ok(file) => vecs.push(file),
        Err(_) => break,
    };
}
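One caveat: breaking out of the loop on any Err also swallows real corruption, because a truncated or damaged file then looks the same as a clean end. A sketch that stops only on a clean EOF and propagates everything else, assuming bincode 1.x (where Error is a boxed ErrorKind):

loop {
    match bincode::deserialize_from::<_, MyStruct>(&mut buffered) {
        Ok(v) => vecs.push(v),
        // A clean end-of-file surfaces as an io::Error of kind UnexpectedEof.
        Err(e) => match *e {
            bincode::ErrorKind::Io(ref io)
                if io.kind() == std::io::ErrorKind::UnexpectedEof => break,
            _ => panic!("deserialization failed: {}", e),
        },
    }
}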

Why does this program block until someone connects to the FIFO / named pipe?

I found this script in the post Recommended way of IPC in Rust where a server and client are created with named pipes.
I want to understand how it works, so I started debugging. When I start the server with cargo run listen, the program blocks as soon as it reaches the open function. I know this is a feature and not a bug, but I do not understand why it happens.
In the main function, listen is called, and listen in turn calls the open function:
use libc::{c_char, mkfifo};
use serde::{Deserialize, Serialize};
use std::env::args;
use std::fs::{File, OpenOptions};
use std::io::{Error, Read, Result, Write};
use std::os::unix::ffi::OsStrExt;
use std::path::{Path, PathBuf};

fn main() -> Result<()> {
    let mut args = args();
    let _ = args.next();
    match args.next().as_ref().map(String::as_str) {
        Some("listen") => listen()?,
        Some("send") => {
            let msg = args.next().unwrap();
            send(msg)?;
        }
        _ => {
            eprintln!("Please either listen or send.");
        }
    }
    Ok(())
}

pub struct Fifo {
    path: PathBuf,
}

impl Fifo {
    pub fn new(path: PathBuf) -> Result<Self> {
        let os_str = path.clone().into_os_string();
        let slice = os_str.as_bytes();
        let mut bytes = Vec::with_capacity(slice.len() + 1);
        bytes.extend_from_slice(slice);
        bytes.push(0); // zero terminated string
        let _ = std::fs::remove_file(&path);
        if unsafe { mkfifo((&bytes[0]) as *const u8 as *const c_char, 0o644) } != 0 {
            Err(Error::last_os_error())
        } else {
            Ok(Fifo { path })
        }
    }

    /// Blocks until anyone connects to this fifo.
    pub fn open(&self) -> Result<FifoHandle> {
        let mut pipe = OpenOptions::new().read(true).open(&self.path)?;

        let mut pid_bytes = [0u8; 4];
        pipe.read_exact(&mut pid_bytes)?;
        let pid = u32::from_ne_bytes(pid_bytes);
        drop(pipe);

        let read = OpenOptions::new()
            .read(true)
            .open(format!("/tmp/rust-fifo-read.{}", pid))?;
        let write = OpenOptions::new()
            .write(true)
            .open(format!("/tmp/rust-fifo-write.{}", pid))?;

        Ok(FifoHandle { read, write })
    }
}

impl Drop for Fifo {
    fn drop(&mut self) {
        let _ = std::fs::remove_file(&self.path);
    }
}

#[derive(Serialize, Deserialize)]
pub enum Message {
    Print(String),
    Ack(),
}

pub struct FifoHandle {
    read: File,
    write: File,
}

impl FifoHandle {
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        let pid = std::process::id();

        let read_fifo_path = format!("/tmp/rust-fifo-write.{}", pid);
        let read_fifo = Fifo::new(read_fifo_path.into())?;
        let write_fifo_path = format!("/tmp/rust-fifo-read.{}", pid);
        let write_fifo = Fifo::new(write_fifo_path.into())?;

        let mut pipe = OpenOptions::new().write(true).open(path.as_ref())?;

        let pid_bytes: [u8; 4] = u32::to_ne_bytes(pid);
        pipe.write_all(&pid_bytes)?;
        pipe.flush()?;

        let write = OpenOptions::new().write(true).open(&write_fifo.path)?;
        let read = OpenOptions::new().read(true).open(&read_fifo.path)?;

        Ok(Self { read, write })
    }

    pub fn send_message(&mut self, msg: &Message) -> Result<()> {
        let msg = bincode::serialize(msg).expect("Serialization failed");
        self.write.write_all(&usize::to_ne_bytes(msg.len()))?;
        self.write.write_all(&msg[..])?;
        self.write.flush()
    }

    pub fn recv_message(&mut self) -> Result<Message> {
        let mut len_bytes = [0u8; std::mem::size_of::<usize>()];
        self.read.read_exact(&mut len_bytes)?;
        let len = usize::from_ne_bytes(len_bytes);

        let mut buf = vec![0; len];
        self.read.read_exact(&mut buf[..])?;

        Ok(bincode::deserialize(&buf[..]).expect("Deserialization failed"))
    }
}

fn listen() -> Result<()> {
    let fifo = Fifo::new(PathBuf::from("/tmp/rust-fifo"))?;

    loop {
        let mut handle = fifo.open()?;

        std::thread::spawn(move || {
            match handle.recv_message().expect("Failed to receive message") {
                Message::Print(p) => println!("{}", p),
                Message::Ack() => panic!("Didn't expect Ack now."),
            }

            #[allow(deprecated)]
            std::thread::sleep_ms(1000);

            handle
                .send_message(&Message::Ack())
                .expect("Send message failed.");
        });
    }
}

fn send(s: String) -> Result<()> {
    let mut handle = FifoHandle::open("/tmp/rust-fifo")?;

    #[allow(deprecated)]
    std::thread::sleep_ms(1000);

    handle.send_message(&Message::Print(s))?;

    match handle.recv_message()? {
        Message::Print(p) => println!("{}", p),
        Message::Ack() => {}
    }

    Ok(())
}
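What you are seeing is standard POSIX FIFO behavior, not anything Rust-specific: open(2) on a FIFO for reading blocks until some other process opens the same FIFO for writing, and opening for writing likewise blocks until there is a reader, so the two ends rendezvous at open time. That is exactly what the doc comment on Fifo::open means by "Blocks until anyone connects to this fifo." If you need an open that does not block, you can pass O_NONBLOCK; a minimal sketch of the read side, assuming the libc crate for the flag:

use std::fs::{File, OpenOptions};
use std::os::unix::fs::OpenOptionsExt;

fn open_fifo_nonblocking(path: &str) -> std::io::Result<File> {
    // With O_NONBLOCK, a read-only open succeeds immediately even if no
    // writer is connected yet; reads then return 0 bytes until one appears.
    OpenOptions::new()
        .read(true)
        .custom_flags(libc::O_NONBLOCK)
        .open(path)
}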

Why does multithreaded writing to a Vec<HashSet<&str>> using raw pointers segfault?

I want to modify a big vector from multiple threads in parallel.
Works fine: u32
use std::thread;
use std::sync::Arc;

fn main() {
    let input = Arc::new([1u32, 2, 3, 4]);
    let mut handles = Vec::new();
    for t in 0..4 {
        let inp = input.clone();
        let handle = thread::spawn(move || unsafe {
            let p = (inp.as_ptr() as *mut u32).offset(t as isize);
            *p = inp[t] + t as u32;
        });
        handles.push(handle);
    }
    for h in handles {
        h.join().unwrap();
    }
    println!("{:?}", input);
}
Segfaults: Vec<HashSet<&str>>
When I change the u32 to Vec<HashSet<&str>>, the pointer does not seem to work.
use std::thread;
use std::sync::Arc;
use std::collections::HashSet;

fn main() {
    let mut a = HashSet::new();
    a.insert("aaa");
    let input = Arc::new(vec![a.clone(), a.clone(), a.clone(), a.clone()]);
    let mut handles = Vec::new();
    for _t in 0..4 {
        let inp = input.clone();
        let handle = thread::spawn(move || unsafe {
            let p = (inp.as_ptr() as *mut Vec<HashSet<&str>>).offset(0);
            (*p)[0].insert("bbb");
        });
        handles.push(handle);
    }
    for h in handles {
        h.join().unwrap();
    }
    println!("{:?}", input);
}
What is the difference?
It is hard to say exactly what goes wrong in your initial code, as it just segfaults in the playground. You are likely invoking undefined behavior by taking a reference to an immutable (!) vec and trying to mutate its elements by casting &Vec -> *mut Vec -> &mut Vec (on a method call). Multiple mutable references to the same thing are a big no-no. Besides, your code even mutates the same HashSet ((*p)[0]) from several threads in parallel, which, again, is undefined behavior.
The easiest way out here is crossbeam's scoped threads. They allow referencing stack variables, like your input, and a Vec can hand out distinct mutable references to its elements without any unsafe. With this, your code seems to do the expected thing:
use crossbeam::thread;
use std::collections::HashSet;

fn main() {
    let mut a = HashSet::new();
    a.insert("aaa");
    let mut input = vec![a.clone(), a.clone(), a.clone(), a.clone()];

    thread::scope(|s| {
        for set in &mut input {
            s.spawn(move |_| {
                set.insert("bbb");
            });
        }
    })
    .unwrap();

    println!("{:?}", input);
}
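Since Rust 1.63 the standard library ships scoped threads as well, so the same pattern works without an external crate; a minimal equivalent sketch:

use std::collections::HashSet;

fn main() {
    let mut a = HashSet::new();
    a.insert("aaa");
    let mut input = vec![a.clone(), a.clone(), a.clone(), a.clone()];

    // Scoped threads may borrow from the enclosing stack frame;
    // each thread receives a distinct &mut to one element.
    std::thread::scope(|s| {
        for set in &mut input {
            s.spawn(move || {
                set.insert("bbb");
            });
        }
    });

    println!("{:?}", input);
}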

Deserializing newline-delimited JSON from a socket using Serde

I am trying to use serde for sending a JSON struct from a client to a server. A newline from the client to the server marks the end of a message. My server looks like this:
use serde::{Deserialize, Serialize};
use std::env;
use std::io::{stdin, BufRead, BufReader, Error, Read, Write};
use std::net::{TcpListener, TcpStream};
use std::str;
use std::thread;

#[derive(Serialize, Deserialize, Debug)]
struct Point3D {
    x: u32,
    y: u32,
    z: u32,
}

fn handle_client(mut stream: TcpStream) -> Result<(), Error> {
    println!("Incoming connection from: {}", stream.peer_addr()?);
    let mut buffer = [0; 512];
    loop {
        let bytes_read = stream.read(&mut buffer)?;
        if bytes_read == 0 {
            return Ok(());
        }
        let buf_str: &str = str::from_utf8(&buffer).expect("Boom");
        let input: Point3D = serde_json::from_str(&buf_str)?;
        let result: String = (input.x.pow(2) + input.y.pow(2) + input.z.pow(2)).to_string();
        stream.write(result.as_bytes())?;
    }
}

fn main() {
    let args: Vec<_> = env::args().collect();
    if args.len() != 2 {
        eprintln!("Please provide --client or --server as argument");
        std::process::exit(1);
    }
    if args[1] == "--server" {
        let listener = TcpListener::bind("0.0.0.0:8888").expect("Could not bind");
        for stream in listener.incoming() {
            match stream {
                Err(e) => eprintln!("failed: {}", e),
                Ok(stream) => {
                    thread::spawn(move || {
                        handle_client(stream).unwrap_or_else(|error| eprintln!("{:?}", error));
                    });
                }
            }
        }
    } else if args[1] == "--client" {
        let mut stream = TcpStream::connect("127.0.0.1:8888").expect("Could not connect to server");
        println!("Please provide a 3D point as three comma separated integers");
        loop {
            let mut input = String::new();
            let mut buffer: Vec<u8> = Vec::new();
            stdin()
                .read_line(&mut input)
                .expect("Failed to read from stdin");
            let parts: Vec<&str> = input.trim_matches('\n').split(',').collect();
            let point = Point3D {
                x: parts[0].parse().unwrap(),
                y: parts[1].parse().unwrap(),
                z: parts[2].parse().unwrap(),
            };
            stream
                .write(serde_json::to_string(&point).unwrap().as_bytes())
                .expect("Failed to write to server");
            let mut reader = BufReader::new(&stream);
            reader
                .read_until(b'\n', &mut buffer)
                .expect("Could not read into buffer");
            print!(
                "{}",
                str::from_utf8(&buffer).expect("Could not write buffer as string")
            );
        }
    }
}
How do I know what length of buffer to allocate before reading in the string? If my buffer is too large, serde fails to deserialize it with an error about invalid trailing characters, since the unused part of the buffer is still full of zero bytes. Is there a better way to do this?
Place the TcpStream into a BufReader. This allows you to read until a specific byte (in this case a newline). You can then parse the read bytes with Serde:
use std::io::{BufRead, BufReader, Write};

fn handle_client(stream: TcpStream) -> Result<(), Error> {
    let mut data = Vec::new();
    let mut stream = BufReader::new(stream);

    loop {
        data.clear();

        let bytes_read = stream.read_until(b'\n', &mut data)?;
        if bytes_read == 0 {
            return Ok(());
        }

        let input: Point3D = serde_json::from_slice(&data)?;
        let value = input.x.pow(2) + input.y.pow(2) + input.z.pow(2);

        // Terminate the response with a newline so the client's
        // read_until(b'\n') can tell where it ends.
        writeln!(stream.get_mut(), "{}", value)?;
    }
}
I'm being a little fancy by reusing the allocation of data, which means it's very important to reset the buffer at the beginning of each loop. I also avoid allocating memory for the result and just print directly to the output stream.
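Alternatively, serde_json can split a stream of concatenated JSON values by itself, with no newline framing at all, via its StreamDeserializer. A sketch of the same server loop in that style, reusing the question's Point3D and cloning the TcpStream so the read half can be buffered:

use std::io::{BufReader, Write};
use std::net::TcpStream;

fn handle_client(stream: TcpStream) -> std::io::Result<()> {
    let reader = BufReader::new(stream.try_clone()?);
    let mut writer = stream;
    // into_iter() yields one Result<Point3D, _> per JSON value in the stream.
    for point in serde_json::Deserializer::from_reader(reader).into_iter::<Point3D>() {
        let input = point?;
        writeln!(writer, "{}", input.x.pow(2) + input.y.pow(2) + input.z.pow(2))?;
    }
    Ok(())
}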
