chunked multithreading! 0u0 also conditional compilation of some stuff

This commit is contained in:
Lynne Megido 2021-02-06 21:51:20 +10:00
parent 8fc3f18466
commit e3466a8912
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90
5 changed files with 98 additions and 44 deletions

View file

@ -6,6 +6,11 @@ edition = "2018"
license = "GPL-3.0-or-later"
#license-file = "LICENSE"
[features]
default = ["mini-buffer", "multi-threaded"]
mini-buffer = []
multi-threaded = []
[dependencies]
walkdir = "2.3.1"
log = "0.4.14"

BIN
chunked

Binary file not shown.

View file

@ -0,0 +1,23 @@
use std::fmt;
use std::fmt::Formatter;
/// Output format for the lines produced from scan results.
///
/// Every function is an associated function (no `self`): it writes one line
/// of output for one kind of finding into the supplied formatter and returns
/// the formatter's result.
trait Format {
    /// Writes the line describing a rename from `from` to `to`.
    fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result;

    /// Writes the line reporting that the file at `path` could not be read.
    fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result;

    /// Writes the line reporting that no mime type was detected for `path`.
    fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result;
}
/// Formatter that renders findings as lines of a POSIX shell script.
struct Script {}

/// Writes `text` to `f`, substituting `replacement` for every occurrence of
/// `needle`.
fn write_replacing(
    f: &mut Formatter<'_>,
    text: &str,
    needle: char,
    replacement: &str,
) -> fmt::Result {
    for (index, piece) in text.split(needle).enumerate() {
        if index > 0 {
            f.write_str(replacement)?;
        }
        f.write_str(piece)?;
    }
    Ok(())
}

/// Writes `word` to `f` as one single-quoted shell word.
///
/// Inside single quotes the shell treats every character literally, so the
/// only character needing care is `'` itself: it is emitted as `'\''` (close
/// the quote, an escaped quote, reopen the quote).
fn write_shell_quoted(f: &mut Formatter<'_>, word: &str) -> fmt::Result {
    f.write_str("'")?;
    write_replacing(f, word, '\'', "'\\''")?;
    f.write_str("'")
}

impl Format for Script {
    /// Writes `mv '<from>' '<to>'`, with both paths single-quoted so that
    /// spaces, globs, `$`, quotes and other metacharacters in file names are
    /// passed to `mv` verbatim instead of being interpreted by the shell.
    fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result {
        f.write_str("mv ")?;
        write_shell_quoted(f, from)?;
        f.write_str(" ")?;
        write_shell_quoted(f, to)
    }

    /// Writes a `#` comment noting that `path` could not be read.
    ///
    /// Newlines in `path` are written as the two characters `\n`: a raw
    /// newline would end the comment and turn the rest of the file name into
    /// a command.
    fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
        f.write_str("# Failed to read ")?;
        write_replacing(f, path, '\n', "\\n")
    }

    /// Writes a `#` comment noting that no mime type was detected for `path`.
    ///
    /// Newlines in `path` are escaped for the same reason as in `unreadable`.
    fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
        f.write_str("# Failed to detect mime type for ")?;
        write_replacing(f, path, '\n', "\\n")
    }
}

View file

@ -6,7 +6,7 @@ use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use smartstring::alias::String;
use cached::proc_macro::cached;
use log::{debug, warn};
// use log::{debug, warn};
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131

View file

@ -16,15 +16,17 @@
mod parameters;
mod inspectors;
mod formats;
use std::path::{Path, PathBuf};
use walkdir::{WalkDir, DirEntry};
use mime_guess::Mime;
use smartstring::alias::String;
use clap::Clap;
use log::{debug, trace, info, warn, error};
use log::{debug, trace, info, warn};
use rayon::prelude::*;
use std::fmt::{self, Display};
use xdg_mime::SharedMimeInfo;
struct Findings {
file: PathBuf,
@ -99,6 +101,70 @@ fn extension_from_path(path: &Path) -> Option<String> {
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
}
/// Inspects a single directory entry and reports whether its file extension
/// agrees with its detected mime type.
///
/// # Errors
///
/// * `(ScanError::File, path)` when `inspectors::mime_type` returns an error
///   for the entry.
/// * `(ScanError::Mime, path)` when `inspectors::mime_type` succeeds but yields
///   no mime type.
fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
    let path = entry.path();

    // try to determine mimetype for this entry
    let mime = match inspectors::mime_type(db, path) {
        // an error occurred while trying to read the file
        Err(_) => return Err((ScanError::File, path.to_path_buf())),
        // the file was read successfully, but we were unable to determine its mimetype
        Ok(None) => return Err((ScanError::Mime, path.to_path_buf())),
        Ok(Some(mime)) => mime,
    };

    // set of known extensions for the given mimetype
    let known_exts = inspectors::mime_extension_lookup(mime.clone());
    // file extension for this particular file
    let entry_ext = extension_from_path(path);

    let valid = match (known_exts, entry_ext) {
        // there is a known set of extensions for this mimetype, and the file has an extension
        (Some(exts), Some(ext)) => exts.contains(&ext.to_lowercase().into()),
        // there is a known set of extensions for this mimetype, but the file has no extension
        (Some(_), None) => false,
        // there is no known set of extensions for this mimetype -- assume it's correct
        (None, _) => true,
    };

    Ok(Findings {
        file: path.to_path_buf(),
        valid, // TODO: make this a function
        mime,
    })
}
/// Runs `scan_file` on every entry and gathers the per-file results.
///
/// With the `multi-threaded` feature enabled the entries are processed through
/// rayon in chunks; without it they are processed one after another on the
/// calling thread.
fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
    #[cfg(feature = "multi-threaded")] {
        // Number of files handed to rayon as one unit of parallel work.
        const CHUNK_SIZE: usize = 16;

        // instead of a plain par_iter over individual files, give rayon slices of up to
        // CHUNK_SIZE entries and scan each slice sequentially. the original author measured
        // a substantial speedup from batching the work this way.
        entries
            .par_chunks(CHUNK_SIZE)
            .flat_map(|batch| {
                // each batch is a slice of DirEntry structs; flat_map stitches the
                // per-batch Vecs back into one flat sequence of results
                batch
                    .iter()
                    .map(|item| scan_file(db, item))
                    .collect::<Vec<_>>()
            })
            .collect()
    }
    #[cfg(not(feature = "multi-threaded"))] {
        // single-threaded fallback: scan the entries in order
        entries
            .iter()
            .map(|item| scan_file(db, item))
            .collect()
    }
}
fn main() {
let args = parameters::Parameters::parse();
let mut builder = env_logger::Builder::from_default_env();
@ -106,6 +172,7 @@ fn main() {
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
.format_module_path(false) // don't include module in logs, as it's not necessary
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
.target(env_logger::Target::Stdout) // log to stdout rather than stderr
.init();
let db = xdg_mime::SharedMimeInfo::new();
@ -120,48 +187,7 @@ fn main() {
trace!("Found {} items to check", entries.len());
let results: Vec<Result<Findings, (ScanError, PathBuf)>> = entries
.par_iter()
.map(|entry: &DirEntry | {
// try to determine mimetype for this entry
let result = inspectors::mime_type(&db, entry.path());
if let Err(_) = result {
// an error occurred while trying to read the file
// error!("{}: {}", entry.path().to_string_lossy(), error);
return Err((ScanError::File, entry.path().to_path_buf()));
}
let result = result.unwrap();
if result.is_none() {
// the file was read successfully, but we were unable to determine its mimetype
// warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
return Err((ScanError::Mime, entry.path().to_path_buf()));
}
let result = result.unwrap();
// set of known extensions for the given mimetype
let known_exts = inspectors::mime_extension_lookup(result.clone());
// file extension for this particular file
let entry_ext = extension_from_path(entry.path());
let valid = match known_exts {
// there is a known set of extensions for this mimetype, and the file has an extension
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
// there is a known set of extensions for this mimetype, but the file has no extension
Some(_) => false,
// there is no known set of extensions for this mimetype -- assume it's correct
None => true
};
Ok(Findings {
file: entry.path().to_path_buf(),
valid, // make this a function
mime: result,
})
})
.collect();
let results = scan_from_walkdir(&db, entries);
for result in results {
match result {