chunked multithreading! 0u0 also conditional compilation of some stuff

This commit is contained in:
Lynne Megido 2021-02-06 21:51:20 +10:00
parent 8fc3f18466
commit e3466a8912
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90
5 changed files with 98 additions and 44 deletions

View file

@@ -6,6 +6,11 @@ edition = "2018"
license = "GPL-3.0-or-later"
#license-file = "LICENSE"
[features]
default = ["mini-buffer", "multi-threaded"]
mini-buffer = []
multi-threaded = []
[dependencies]
walkdir = "2.3.1"
log = "0.4.14"
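Both features are enabled by default; either can be dropped at build time with cargo's standard --no-default-features / --features flags. As a generic sketch of the mechanism (not a line from this commit), a feature gates code like this:

#[cfg(feature = "multi-threaded")]
fn scan_parallel() { /* compiled only when the multi-threaded feature is enabled */ }

#[cfg(not(feature = "multi-threaded"))]
fn scan_serial() { /* compiled only when the feature is disabled */ }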

BIN
chunked

Binary file not shown.

View file

@@ -0,0 +1,23 @@
use std::fmt;
use std::fmt::Formatter;
trait Format {
fn rename(f: &mut fmt::Formatter<'_>, from: &str, to: &str) -> fmt::Result;
fn unreadable(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result;
fn unknown_type(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result;
}
struct Script {}
impl Format for Script {
fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result {
write!(f, "mv {} {}", from, to)
}
fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
write!(f, "# Failed to read {}", path)
}
fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
write!(f, "# Failed to detect mime type for {}", path)
}
}
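Since the trait's functions write through a fmt::Formatter rather than returning Strings, a caller needs something that implements Display to drive them. A minimal sketch of one way that could be wired up; the Rename wrapper here is hypothetical and not part of this commit:

use std::fmt::{self, Display, Formatter};

// Hypothetical helper: renders a single rename through the Script format.
struct Rename<'a>(&'a str, &'a str);

impl Display for Rename<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        Script::rename(f, self.0, self.1)
    }
}

// println!("{}", Rename("holiday.jpeg", "holiday.jpg")) prints: mv holiday.jpeg holiday.jpg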

View file

@@ -6,7 +6,7 @@ use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use smartstring::alias::String;
use cached::proc_macro::cached;
use log::{debug, warn};
// use log::{debug, warn};
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
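The upshot of the comment above is that type detection never needs more than the first 262 bytes of a file. A rough sketch of that idea, with an illustrative function name rather than the code from this file:

use std::fs::File;
use std::io::Read;
use std::path::Path;

// Read at most the first 262 bytes, enough for every matcher listed in the infer repository.
fn head_bytes(path: &Path) -> std::io::Result<Vec<u8>> {
    let mut buf = Vec::with_capacity(262);
    File::open(path)?.take(262).read_to_end(&mut buf)?;
    Ok(buf)
}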

View file

@@ -16,15 +16,17 @@
mod parameters;
mod inspectors;
mod formats;
use std::path::{Path, PathBuf};
use walkdir::{WalkDir, DirEntry};
use mime_guess::Mime;
use smartstring::alias::String;
use clap::Clap;
use log::{debug, trace, info, warn, error};
use log::{debug, trace, info, warn};
use rayon::prelude::*;
use std::fmt::{self, Display};
use xdg_mime::SharedMimeInfo;
struct Findings {
file: PathBuf,
@@ -99,30 +101,7 @@ fn extension_from_path(path: &Path) -> Option<String> {
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
}
fn main() {
fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
let args = parameters::Parameters::parse();
let mut builder = env_logger::Builder::from_default_env();
builder
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
.format_module_path(false) // don't include module in logs, as it's not necessary
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
.init();
let db = xdg_mime::SharedMimeInfo::new();
debug!("Iterating directory: {:?}", args.dirs);
let stepper = WalkDir::new(&args.dirs).into_iter();
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(&args, e)) // filter out unwanted files
.filter_map(|e| e.ok()) // ignore anything that fails, e.g. files we don't have read access on
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
.collect();
trace!("Found {} items to check", entries.len());
let results: Vec<Result<Findings, (ScanError, PathBuf)>> = entries
.par_iter()
.map(|entry: &DirEntry | {
// try to determine mimetype for this entry
let result = inspectors::mime_type(&db, entry.path());
@@ -160,9 +139,56 @@ fn main() {
valid, // make this a function
mime: result,
})
})
}
fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
#[cfg(feature = "multi-threaded")] {
// rather than using a standard par_iter, split the entries into chunks of 16 first.
// this allows each spawned thread to handle 16 files before closing, rather than creating a new thread for
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
entries
.par_chunks(16) // split into chunks of 16
.flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>>
.iter() // iter over the chunk, which is a slice of DirEntry structs
.map(|entry| scan_file(db, entry))
.collect::<Vec<_>>()
)
.collect()
}
#[cfg(not(feature = "multi-threaded"))] {
entries
.iter()
.map(|entry: &DirEntry | scan_file(db, entry))
.collect()
}
}
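For contrast, the "standard par_iter" version that the chunking comment refers to (and which the old main() used) is roughly:

// hand rayon individual entries instead of chunks of 16
entries
    .par_iter()
    .map(|entry| scan_file(db, entry))
    .collect()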
fn main() {
let args = parameters::Parameters::parse();
let mut builder = env_logger::Builder::from_default_env();
builder
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
.format_module_path(false) // don't include module in logs, as it's not necessary
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
.target(env_logger::Target::Stdout) // log to stdout rather than stderr
.init();
let db = xdg_mime::SharedMimeInfo::new();
debug!("Iterating directory: {:?}", args.dirs);
let stepper = WalkDir::new(&args.dirs).into_iter();
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(&args, e)) // filter out unwanted files
.filter_map(|e| e.ok()) // ignore anything that fails, e.g. files we don't have read access on
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
.collect();
trace!("Found {} items to check", entries.len());
let results = scan_from_walkdir(&db, entries);
for result in results {
match result {
Ok(r) => {