chunked multithreading! 0u0 also conditional compilation of some stuff
This commit is contained in:
parent
8fc3f18466
commit
e3466a8912
5 changed files with 98 additions and 44 deletions
|
@ -6,6 +6,11 @@ edition = "2018"
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
#license-file = "LICENSE"
|
#license-file = "LICENSE"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["mini-buffer", "multi-threaded"]
|
||||||
|
mini-buffer = []
|
||||||
|
multi-threaded = []
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
walkdir = "2.3.1"
|
walkdir = "2.3.1"
|
||||||
log = "0.4.14"
|
log = "0.4.14"
|
||||||
|
|
BIN
chunked
BIN
chunked
Binary file not shown.
|
@ -0,0 +1,23 @@
|
||||||
|
use std::fmt;
|
||||||
|
use std::fmt::Formatter;
|
||||||
|
|
||||||
|
/// A way of rendering scan outcomes as text.
///
/// Every function is an associated function (no `self`): an implementor is selected purely by
/// type, and each call writes one message into the supplied formatter.
trait Format {
    /// Writes the message for a file at `from` that should be renamed to `to`.
    fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result;

    /// Writes the message for a file at `path` that could not be read.
    fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result;

    /// Writes the message for a file at `path` whose type could not be determined.
    fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result;
}
|
||||||
|
|
||||||
|
struct Script {}
|
||||||
|
impl Format for Script {
|
||||||
|
fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result {
|
||||||
|
write!(f, "mv {} {}", from, to)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
|
||||||
|
write!(f, "# Failed to read {}", path)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
|
||||||
|
write!(f, "# Failed to detect mime type for {}", path)
|
||||||
|
}
|
||||||
|
}
|
|
@ -6,7 +6,7 @@ use std::fs::File;
|
||||||
use std::io::{Read, Seek, SeekFrom};
|
use std::io::{Read, Seek, SeekFrom};
|
||||||
use smartstring::alias::String;
|
use smartstring::alias::String;
|
||||||
use cached::proc_macro::cached;
|
use cached::proc_macro::cached;
|
||||||
use log::{debug, warn};
|
// use log::{debug, warn};
|
||||||
|
|
||||||
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
|
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
|
||||||
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
|
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
|
||||||
|
|
78
src/main.rs
78
src/main.rs
|
@ -16,15 +16,17 @@
|
||||||
|
|
||||||
mod parameters;
|
mod parameters;
|
||||||
mod inspectors;
|
mod inspectors;
|
||||||
|
mod formats;
|
||||||
|
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use walkdir::{WalkDir, DirEntry};
|
use walkdir::{WalkDir, DirEntry};
|
||||||
use mime_guess::Mime;
|
use mime_guess::Mime;
|
||||||
use smartstring::alias::String;
|
use smartstring::alias::String;
|
||||||
use clap::Clap;
|
use clap::Clap;
|
||||||
use log::{debug, trace, info, warn, error};
|
use log::{debug, trace, info, warn};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use std::fmt::{self, Display};
|
use std::fmt::{self, Display};
|
||||||
|
use xdg_mime::SharedMimeInfo;
|
||||||
|
|
||||||
struct Findings {
|
struct Findings {
|
||||||
file: PathBuf,
|
file: PathBuf,
|
||||||
|
@ -99,30 +101,7 @@ fn extension_from_path(path: &Path) -> Option<String> {
|
||||||
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
|
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
|
||||||
let args = parameters::Parameters::parse();
|
|
||||||
let mut builder = env_logger::Builder::from_default_env();
|
|
||||||
builder
|
|
||||||
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
|
|
||||||
.format_module_path(false) // don't include module in logs, as it's not necessary
|
|
||||||
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
|
|
||||||
.init();
|
|
||||||
|
|
||||||
let db = xdg_mime::SharedMimeInfo::new();
|
|
||||||
debug!("Iterating directory: {:?}", args.dirs);
|
|
||||||
|
|
||||||
let stepper = WalkDir::new(&args.dirs).into_iter();
|
|
||||||
let entries: Vec<DirEntry> = stepper
|
|
||||||
.filter_entry(|e| wanted_file(&args, e)) // filter out unwanted files
|
|
||||||
.filter_map(|e| e.ok()) // ignore anything that fails, e.g. files we don't have read access on
|
|
||||||
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
trace!("Found {} items to check", entries.len());
|
|
||||||
|
|
||||||
let results: Vec<Result<Findings, (ScanError, PathBuf)>> = entries
|
|
||||||
.par_iter()
|
|
||||||
.map(|entry: &DirEntry | {
|
|
||||||
// try to determine mimetype for this entry
|
// try to determine mimetype for this entry
|
||||||
let result = inspectors::mime_type(&db, entry.path());
|
let result = inspectors::mime_type(&db, entry.path());
|
||||||
|
|
||||||
|
@ -160,9 +139,56 @@ fn main() {
|
||||||
valid, // make this a function
|
valid, // make this a function
|
||||||
mime: result,
|
mime: result,
|
||||||
})
|
})
|
||||||
})
|
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
|
||||||
|
#[cfg(feature = "multi-threaded")] {
|
||||||
|
// rather than using a standard par_iter, split the entries into chunks of 16 first.
|
||||||
|
// this allows each spawned thread to handle 16 files before before closing, rather than creating a new thread for
|
||||||
|
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
|
||||||
|
entries
|
||||||
|
.par_chunks(16) // split into chunks of 16
|
||||||
|
.flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>>
|
||||||
|
.iter() // iter over the chunk, which is a slice of DirEntry structs
|
||||||
|
.map(|entry| scan_file(db, entry))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "multi-threaded"))] {
|
||||||
|
entries
|
||||||
|
.iter()
|
||||||
|
.map(|entry: &DirEntry | scan_file(db, entry))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let args = parameters::Parameters::parse();
|
||||||
|
let mut builder = env_logger::Builder::from_default_env();
|
||||||
|
builder
|
||||||
|
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
|
||||||
|
.format_module_path(false) // don't include module in logs, as it's not necessary
|
||||||
|
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
|
||||||
|
.target(env_logger::Target::Stdout) // log to stdout rather than stderr
|
||||||
|
.init();
|
||||||
|
|
||||||
|
let db = xdg_mime::SharedMimeInfo::new();
|
||||||
|
debug!("Iterating directory: {:?}", args.dirs);
|
||||||
|
|
||||||
|
let stepper = WalkDir::new(&args.dirs).into_iter();
|
||||||
|
let entries: Vec<DirEntry> = stepper
|
||||||
|
.filter_entry(|e| wanted_file(&args, e)) // filter out unwanted files
|
||||||
|
.filter_map(|e| e.ok()) // ignore anything that fails, e.g. files we don't have read access on
|
||||||
|
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
trace!("Found {} items to check", entries.len());
|
||||||
|
|
||||||
|
let results = scan_from_walkdir(&db, entries);
|
||||||
|
|
||||||
for result in results {
|
for result in results {
|
||||||
match result {
|
match result {
|
||||||
Ok(r) => {
|
Ok(r) => {
|
||||||
|
|
Loading…
Reference in a new issue