chunked multithreading! 0u0 also conditional compilation of some stuff
This commit is contained in:
parent
8fc3f18466
commit
e3466a8912
5 changed files with 98 additions and 44 deletions
|
@ -6,6 +6,11 @@ edition = "2018"
|
|||
license = "GPL-3.0-or-later"
|
||||
#license-file = "LICENSE"
|
||||
|
||||
[features]
|
||||
default = ["mini-buffer", "multi-threaded"]
|
||||
mini-buffer = []
|
||||
multi-threaded = []
|
||||
|
||||
[dependencies]
|
||||
walkdir = "2.3.1"
|
||||
log = "0.4.14"
|
||||
|
|
BIN
chunked
BIN
chunked
Binary file not shown.
|
@ -0,0 +1,23 @@
|
|||
use std::fmt;
|
||||
use std::fmt::Formatter;
|
||||
|
||||
/// The set of message writers an output format has to provide.
///
/// Every function is an associated function (there is no `self` receiver): it is
/// handed the formatter to write into and returns a `fmt::Result`.
trait Format {
    /// Writes the message describing a rename of `from` to `to`.
    fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result;

    /// Writes the message for a `path` that could not be read.
    fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result;

    /// Writes the message for a `path` whose type is unknown.
    fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result;
}
|
||||
|
||||
struct Script {}
|
||||
impl Format for Script {
|
||||
fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result {
|
||||
write!(f, "mv {} {}", from, to)
|
||||
}
|
||||
|
||||
fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
|
||||
write!(f, "# Failed to read {}", path)
|
||||
}
|
||||
|
||||
fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
|
||||
write!(f, "# Failed to detect mime type for {}", path)
|
||||
}
|
||||
}
|
|
@ -6,7 +6,7 @@ use std::fs::File;
|
|||
use std::io::{Read, Seek, SeekFrom};
|
||||
use smartstring::alias::String;
|
||||
use cached::proc_macro::cached;
|
||||
use log::{debug, warn};
|
||||
// use log::{debug, warn};
|
||||
|
||||
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
|
||||
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
|
||||
|
|
112
src/main.rs
112
src/main.rs
|
@ -16,15 +16,17 @@
|
|||
|
||||
mod parameters;
|
||||
mod inspectors;
|
||||
mod formats;
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use walkdir::{WalkDir, DirEntry};
|
||||
use mime_guess::Mime;
|
||||
use smartstring::alias::String;
|
||||
use clap::Clap;
|
||||
use log::{debug, trace, info, warn, error};
|
||||
use log::{debug, trace, info, warn};
|
||||
use rayon::prelude::*;
|
||||
use std::fmt::{self, Display};
|
||||
use xdg_mime::SharedMimeInfo;
|
||||
|
||||
struct Findings {
|
||||
file: PathBuf,
|
||||
|
@ -99,6 +101,70 @@ fn extension_from_path(path: &Path) -> Option<String> {
|
|||
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
|
||||
}
|
||||
|
||||
fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
|
||||
// try to determine mimetype for this entry
|
||||
let result = inspectors::mime_type(&db, entry.path());
|
||||
|
||||
if let Err(_) = result {
|
||||
// an error occurred while trying to read the file
|
||||
// error!("{}: {}", entry.path().to_string_lossy(), error);
|
||||
return Err((ScanError::File, entry.path().to_path_buf()));
|
||||
}
|
||||
|
||||
let result = result.unwrap();
|
||||
if result.is_none() {
|
||||
// the file was read successfully, but we were unable to determine its mimetype
|
||||
// warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
|
||||
return Err((ScanError::Mime, entry.path().to_path_buf()));
|
||||
}
|
||||
|
||||
let result = result.unwrap();
|
||||
|
||||
// set of known extensions for the given mimetype
|
||||
let known_exts = inspectors::mime_extension_lookup(result.clone());
|
||||
// file extension for this particular file
|
||||
let entry_ext = extension_from_path(entry.path());
|
||||
|
||||
let valid = match known_exts {
|
||||
// there is a known set of extensions for this mimetype, and the file has an extension
|
||||
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
||||
// there is a known set of extensions for this mimetype, but the file has no extension
|
||||
Some(_) => false,
|
||||
// there is no known set of extensions for this mimetype -- assume it's correct
|
||||
None => true
|
||||
};
|
||||
|
||||
Ok(Findings {
|
||||
file: entry.path().to_path_buf(),
|
||||
valid, // make this a function
|
||||
mime: result,
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
|
||||
#[cfg(feature = "multi-threaded")] {
|
||||
// rather than using a standard par_iter, split the entries into chunks of 16 first.
|
||||
// this allows each spawned thread to handle 16 files before before closing, rather than creating a new thread for
|
||||
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
|
||||
entries
|
||||
.par_chunks(16) // split into chunks of 16
|
||||
.flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>>
|
||||
.iter() // iter over the chunk, which is a slice of DirEntry structs
|
||||
.map(|entry| scan_file(db, entry))
|
||||
.collect::<Vec<_>>()
|
||||
)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "multi-threaded"))] {
|
||||
entries
|
||||
.iter()
|
||||
.map(|entry: &DirEntry | scan_file(db, entry))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args = parameters::Parameters::parse();
|
||||
let mut builder = env_logger::Builder::from_default_env();
|
||||
|
@ -106,6 +172,7 @@ fn main() {
|
|||
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
|
||||
.format_module_path(false) // don't include module in logs, as it's not necessary
|
||||
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
|
||||
.target(env_logger::Target::Stdout) // log to stdout rather than stderr
|
||||
.init();
|
||||
|
||||
let db = xdg_mime::SharedMimeInfo::new();
|
||||
|
@ -120,48 +187,7 @@ fn main() {
|
|||
|
||||
trace!("Found {} items to check", entries.len());
|
||||
|
||||
let results: Vec<Result<Findings, (ScanError, PathBuf)>> = entries
|
||||
.par_iter()
|
||||
.map(|entry: &DirEntry | {
|
||||
// try to determine mimetype for this entry
|
||||
let result = inspectors::mime_type(&db, entry.path());
|
||||
|
||||
if let Err(_) = result {
|
||||
// an error occurred while trying to read the file
|
||||
// error!("{}: {}", entry.path().to_string_lossy(), error);
|
||||
return Err((ScanError::File, entry.path().to_path_buf()));
|
||||
}
|
||||
|
||||
let result = result.unwrap();
|
||||
if result.is_none() {
|
||||
// the file was read successfully, but we were unable to determine its mimetype
|
||||
// warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
|
||||
return Err((ScanError::Mime, entry.path().to_path_buf()));
|
||||
}
|
||||
|
||||
let result = result.unwrap();
|
||||
|
||||
// set of known extensions for the given mimetype
|
||||
let known_exts = inspectors::mime_extension_lookup(result.clone());
|
||||
// file extension for this particular file
|
||||
let entry_ext = extension_from_path(entry.path());
|
||||
|
||||
let valid = match known_exts {
|
||||
// there is a known set of extensions for this mimetype, and the file has an extension
|
||||
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
||||
// there is a known set of extensions for this mimetype, but the file has no extension
|
||||
Some(_) => false,
|
||||
// there is no known set of extensions for this mimetype -- assume it's correct
|
||||
None => true
|
||||
};
|
||||
|
||||
Ok(Findings {
|
||||
file: entry.path().to_path_buf(),
|
||||
valid, // make this a function
|
||||
mime: result,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
let results = scan_from_walkdir(&db, entries);
|
||||
|
||||
for result in results {
|
||||
match result {
|
||||
|
|
Loading…
Reference in a new issue