diff --git a/Cargo.toml b/Cargo.toml
index 58dd3e2..0c7b905 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,11 @@ edition = "2018"
 license = "GPL-3.0-or-later"
 #license-file = "LICENSE"
 
+[features]
+default = ["mini-buffer", "multi-threaded"]
+mini-buffer = []
+multi-threaded = []
+
 [dependencies]
 walkdir = "2.3.1"
 log = "0.4.14"
diff --git a/chunked b/chunked
deleted file mode 100755
index 2840f3d..0000000
Binary files a/chunked and /dev/null differ
diff --git a/src/formats.rs b/src/formats.rs
index e69de29..2e0e7a9 100644
--- a/src/formats.rs
+++ b/src/formats.rs
@@ -0,0 +1,36 @@
+//! Output formats used to report scan results.
+
+use std::fmt;
+
+/// Renders the individual findings of a scan in a particular output format.
+trait Format {
+    /// Writes the instruction to rename the file at `from` to `to`.
+    fn rename(f: &mut fmt::Formatter<'_>, from: &str, to: &str) -> fmt::Result;
+    /// Writes a note that the file at `path` could not be read.
+    fn unreadable(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result;
+    /// Writes a note that no mime type could be detected for the file at `path`.
+    fn unknown_type(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result;
+}
+
+/// Wraps `s` in single quotes so a POSIX shell treats it as one literal word.
+fn shell_quote(s: &str) -> String {
+    format!("'{}'", s.replace('\'', r"'\''"))
+}
+
+/// Shell script output: renames become `mv` commands, problems become `#` comments.
+struct Script;
+
+impl Format for Script {
+    fn rename(f: &mut fmt::Formatter<'_>, from: &str, to: &str) -> fmt::Result {
+        // quote both paths so names with spaces or shell metacharacters stay intact
+        write!(f, "mv {} {}", shell_quote(from), shell_quote(to))
+    }
+
+    fn unreadable(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result {
+        write!(f, "# Failed to read {}", path)
+    }
+
+    fn unknown_type(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result {
+        write!(f, "# Failed to detect mime type for {}", path)
+    }
+}
diff --git a/src/inspectors.rs b/src/inspectors.rs
index 580fd09..fb32d5f 100644
--- a/src/inspectors.rs
+++ b/src/inspectors.rs
@@ -6,7 +6,6 @@ use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use smartstring::alias::String;
 use cached::proc_macro::cached;
-use log::{debug, warn};
 
 // from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
 // buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
diff --git a/src/main.rs b/src/main.rs
index 3d731f0..e074fcc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -16,15 +16,17 @@
 
 mod parameters;
 mod inspectors;
+mod formats;
 
 use std::path::{Path, PathBuf};
 use walkdir::{WalkDir, DirEntry};
 use mime_guess::Mime;
 use smartstring::alias::String;
 use clap::Clap;
-use log::{debug, trace, info, warn, error};
+use log::{debug, trace, info, warn};
 use rayon::prelude::*;
 use std::fmt::{self, Display};
+use xdg_mime::SharedMimeInfo;
 
 struct Findings {
     file: PathBuf,
@@ -99,6 +101,71 @@ fn extension_from_path(path: &Path) -> Option<String> {
         map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
 }
 
+/// Inspects a single directory entry and reports whether its extension matches its mimetype.
+///
+/// Returns `Err` with `ScanError::File` if the file could not be read, or with `ScanError::Mime` if
+/// it was read but its mimetype could not be determined; both carry the entry's path.
+fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
+    // try to determine mimetype for this entry
+    let mime = match inspectors::mime_type(db, entry.path()) {
+        Ok(Some(mime)) => mime,
+        // the file was read successfully, but we were unable to determine its mimetype
+        Ok(None) => return Err((ScanError::Mime, entry.path().to_path_buf())),
+        // an error occurred while trying to read the file
+        Err(_) => return Err((ScanError::File, entry.path().to_path_buf())),
+    };
+
+    // set of known extensions for the given mimetype
+    let known_exts = inspectors::mime_extension_lookup(mime.clone());
+    // file extension for this particular file
+    let entry_ext = extension_from_path(entry.path());
+
+    let valid = match (known_exts, entry_ext) {
+        // there is a known set of extensions for this mimetype, and the file has an extension
+        (Some(exts), Some(ext)) => exts.contains(&ext.to_lowercase().into()),
+        // there is a known set of extensions for this mimetype, but the file has no extension
+        (Some(_), None) => false,
+        // there is no known set of extensions for this mimetype -- assume it's correct
+        (None, _) => true,
+    };
+
+    Ok(Findings {
+        file: entry.path().to_path_buf(),
+        valid, // make this a function
+        mime,
+    })
+}
+
+/// Scans every entry in `entries` with [`scan_file`], returning one result per entry.
+///
+/// With the `multi-threaded` feature enabled the entries are processed in parallel; otherwise they
+/// are processed sequentially on the calling thread.
+fn scan_from_walkdir(
+    db: &SharedMimeInfo,
+    entries: Vec<DirEntry>,
+) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
+    #[cfg(feature = "multi-threaded")] {
+        // rather than using a standard par_iter, split the entries into chunks of 16 first.
+        // this allows each spawned thread to handle 16 files before closing, rather than creating a new thread for
+        // each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
+        entries
+            .par_chunks(16) // split into chunks of 16
+            .flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>>
+                .iter() // iter over the chunk, which is a slice of DirEntry structs
+                .map(|entry| scan_file(db, entry))
+                .collect::<Vec<_>>()
+            )
+            .collect()
+    }
+
+    #[cfg(not(feature = "multi-threaded"))] {
+        entries
+            .iter()
+            .map(|entry| scan_file(db, entry))
+            .collect()
+    }
+}
+
 fn main() {
     let args = parameters::Parameters::parse();
     let mut builder = env_logger::Builder::from_default_env();
@@ -106,6 +173,7 @@ fn main() {
         // .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
         .format_module_path(false) // don't include module in logs, as it's not necessary
         .format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
+        .target(env_logger::Target::Stdout) // log to stdout rather than stderr
         .init();
 
     let db = xdg_mime::SharedMimeInfo::new();
@@ -120,48 +188,7 @@ fn main() {
 
     trace!("Found {} items to check", entries.len());
 
-    let results: Vec<Result<Findings, (ScanError, PathBuf)>> = entries
-        .par_iter()
-        .map(|entry: &DirEntry | {
-            // try to determine mimetype for this entry
-            let result = inspectors::mime_type(&db, entry.path());
-
-            if let Err(_) = result {
-                // an error occurred while trying to read the file
-                // error!("{}: {}", entry.path().to_string_lossy(), error);
-                return Err((ScanError::File, entry.path().to_path_buf()));
-            }
-
-            let result = result.unwrap();
-            if result.is_none() {
-                // the file was read successfully, but we were unable to determine its mimetype
-                // warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
-                return Err((ScanError::Mime, entry.path().to_path_buf()));
-            }
-
-            let result = result.unwrap();
-
-            // set of known extensions for the given mimetype
-            let known_exts = inspectors::mime_extension_lookup(result.clone());
-            // file extension for this particular file
-            let entry_ext = extension_from_path(entry.path());
-
-            let valid = match known_exts {
-                // there is a known set of extensions for this mimetype, and the file has an extension
-                Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
-                // there is a known set of extensions for this mimetype, but the file has no extension
-                Some(_) => false,
-                // there is no known set of extensions for this mimetype -- assume it's correct
-                None => true
-            };
-
-            Ok(Findings {
-                file: entry.path().to_path_buf(),
-                valid, // make this a function
-                mime: result,
-            })
-        })
-        .collect();
+    let results = scan_from_walkdir(&db, entries);
 
     for result in results {
         match result {