implemented parallel functionality! 0u0

This commit is contained in:
Lynne Megido 2021-02-05 22:45:51 +10:00
parent 775fb306ad
commit 2c4a8f6a3b
Signed by: lynnesbian
GPG Key ID: F0A184B5213D9F90
4 changed files with 479 additions and 42 deletions

362
Cargo.lock generated
View File

@ -15,6 +15,26 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "async-mutex"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "479db852db25d9dbf6204e6cb6253698f175c15726470f78af0d918e99d6156e"
dependencies = [
"event-listener",
]
[[package]]
name = "async-trait"
version = "0.1.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d3a45e77e34375a7923b1e8febb049bb011f064714a8e17a1a616fef01da13d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "atty"
version = "0.2.14"
@ -38,6 +58,40 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "cached"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e2afe73808fbaac302e39c9754bfc3c4b4d0f99c9c240b9f4e4efc841ad1b74"
dependencies = [
"async-mutex",
"async-trait",
"cached_proc_macro",
"cached_proc_macro_types",
"futures",
"hashbrown",
"once_cell",
]
[[package]]
name = "cached_proc_macro"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf857ae42d910aede5c5186e62684b0d7a597ce2fe3bd14448ab8f7ef439848c"
dependencies = [
"async-mutex",
"cached_proc_macro_types",
"darling",
"quote",
"syn",
]
[[package]]
name = "cached_proc_macro_types"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a4f925191b4367301851c6d99b09890311d74b0d43f274c0b34c86d308a3663"
[[package]]
name = "cfg-if"
version = "0.1.10"
@ -62,8 +116,9 @@ dependencies = [
"indexmap",
"lazy_static",
"os_str_bytes",
"strsim",
"strsim 0.10.0",
"termcolor",
"terminal_size",
"textwrap",
"unicode-width",
"vec_map",
@ -82,6 +137,93 @@ dependencies = [
"syn",
]
[[package]]
name = "const_fn"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6"
[[package]]
name = "crossbeam-channel"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d"
dependencies = [
"cfg-if 1.0.0",
"const_fn",
"crossbeam-utils",
"lazy_static",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d"
dependencies = [
"autocfg",
"cfg-if 1.0.0",
"lazy_static",
]
[[package]]
name = "darling"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim 0.9.3",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "dirs-next"
version = "2.0.0"
@ -103,6 +245,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "env_logger"
version = "0.8.2"
@ -116,19 +264,128 @@ dependencies = [
"termcolor",
]
[[package]]
name = "event-listener"
version = "2.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7531096570974c3a9dcf9e4b8e1cede1ec26cf5046219fb3b9d897503b9be59"
[[package]]
name = "fif"
version = "0.1.0"
dependencies = [
"cached",
"clap",
"env_logger",
"log",
"mime_guess",
"rayon",
"smartstring",
"walkdir",
"xdg-mime",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
[[package]]
name = "futures-executor"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
[[package]]
name = "futures-macro"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
dependencies = [
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
[[package]]
name = "futures-task"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
dependencies = [
"once_cell",
]
[[package]]
name = "futures-util"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"proc-macro-hack",
"proc-macro-nested",
"slab",
]
[[package]]
name = "getrandom"
version = "0.2.2"
@ -176,6 +433,12 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "indexmap"
version = "1.6.1"
@ -226,6 +489,15 @@ version = "2.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
[[package]]
name = "memoffset"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87"
dependencies = [
"autocfg",
]
[[package]]
name = "mime"
version = "0.3.16"
@ -253,6 +525,16 @@ dependencies = [
"version_check",
]
[[package]]
name = "num_cpus"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "once_cell"
version = "1.5.2"
@ -265,6 +547,18 @@ version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85"
[[package]]
name = "pin-project-lite"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
@ -289,6 +583,18 @@ dependencies = [
"version_check",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro-nested"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
[[package]]
name = "proc-macro2"
version = "1.0.24"
@ -307,6 +613,31 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674"
dependencies = [
"autocfg",
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "redox_syscall"
version = "0.2.4"
@ -359,6 +690,18 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "slab"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]]
name = "smartstring"
version = "0.2.6"
@ -374,6 +717,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c"
[[package]]
name = "strsim"
version = "0.10.0"
@ -400,12 +749,23 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "terminal_size"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86ca8ced750734db02076f44132d802af0b33b09942331f4459dde8636fd2406"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "textwrap"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "203008d98caf094106cfaba70acfed15e18ed3ddb7d94e49baec153a2b462789"
dependencies = [
"terminal_size",
"unicode-width",
]

View File

@ -8,11 +8,15 @@ license = "GPL-3.0-or-later"
[dependencies]
walkdir = "2.3.1"
#structopt = "0.3.21"
clap = "3.0.0-beta.2"
log = "0.4.14"
env_logger = "0.8.2"
smartstring = "0.2.6"
# use git version while waiting on a release incorporating https://github.com/ebassi/xdg-mime-rs/commit/de5a6dd
xdg-mime = {git = "https://github.com/ebassi/xdg-mime-rs", version = "0.3"}
mime_guess = "2.0.3"
rayon = "1.5.0"
cached = "0.23.0"
[dependencies.clap]
version = "3.0.0-beta.2"
features = ["wrap_help"]

View File

@ -1,28 +1,36 @@
// use xdg_mime::SharedMimeInfo;
// use std::path::Path;
// use std::io;
// use mime_guess::Mime;
// use std::fs::File;
// use std::io::Read;
use xdg_mime::SharedMimeInfo;
use std::path::Path;
use std::io;
use mime_guess::Mime;
use std::fs::File;
use std::io::Read;
use smartstring::alias::String;
use cached::proc_macro::cached;
// pub fn mime_type(db: &SharedMimeInfo, filepath: &Path) -> io::Result<Option<Mime>, > {
// // attempt to read up to the 256 bytes of the file
// let mut buffer = [0; 256];
// let mut file = File::open(filepath)?;
//
// file.read(&mut buffer)?;
//
// Ok(db.get_mime_type_for_data(&buffer).map(|m| m.0))
// }
// Sniffs the mime type of the file at `path` from its leading bytes, using the shared-mime-info database `db`.
//
// Returns `Err` if the file can't be opened or read, `Ok(None)` if the database yields no mime type for the data,
// and `Ok(Some(mime))` otherwise.
pub fn mime_type(db: &SharedMimeInfo, path: &Path) -> io::Result<Option<Mime>> {
    // we only care about the first few bytes of the file for the purpose of mime sniffing
    const SNIFF_LEN: usize = 256;

    let mut buffer = Vec::with_capacity(SNIFF_LEN);

    // `take` + `read_to_end` keeps reading until either SNIFF_LEN bytes or the end of the file is reached, so the
    // buffer holds exactly the bytes that exist in the file. a single `read` into a fixed-size array may stop early,
    // and would leave files shorter than SNIFF_LEN padded with zero bytes that then get sniffed as if they were
    // part of the file. (the usize -> u64 conversion is a lossless widening.)
    File::open(path)?
        .take(SNIFF_LEN as u64)
        .read_to_end(&mut buffer)?;

    // the database returns the mime type alongside extra data; only the mime type (the first field) is wanted
    Ok(db.get_mime_type_for_data(&buffer).map(|m| m.0))
}
#[cached]
// Returns the list of file extensions known for `mime`, or `None` if mime_guess has no extensions for it.
// The function is wrapped in `#[cached]`, so `mime` is taken by value.
// TODO: avoid cloning mime if possible, although i don't really see how it would be - maybe instead of passing the mime
// object, pass a hash of it?
pub fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> {
    if mime == mime_guess::mime::IMAGE_JPEG {
        // jpeg files are given the primary extension "jpe", due to the extension list being stored in alphabetical
        // order. to handle this particular case, return a custom vector consisting of just "jpg" and "jpeg".
        Some(vec![String::from("jpg"), String::from("jpeg")])
    } else {
        // get a list of possible extensions for this mime type, converting each &str into an owned String
        mime_guess::get_mime_extensions(&mime)
            .map(|exts| exts.iter().map(|&ext| String::from(ext)).collect())
    }
}

View File

@ -17,12 +17,26 @@
mod parameters;
mod inspectors;
use std::path::{Path};
use std::path::{Path, PathBuf};
use walkdir::{WalkDir, DirEntry};
use smartstring::alias::String;
// use structopt::StructOpt;
use clap::Clap;
use log::{info};
use log::{debug, info, warn, error};
use rayon::prelude::*;
use mime_guess::Mime;
// The outcome of successfully inspecting a single file.
struct Findings {
// path of the file that was inspected
file: PathBuf,
// whether the file's current extension is considered acceptable for `mime` (decided by whoever builds this struct)
valid: bool,
// the mime type that was detected for the file
mime: Mime,
}
impl Findings {
    // Returns the first extension listed for this file's mime type, or `None` if no extensions are known for it.
    fn recommended_extension(&self) -> Option<String> {
        // take the first entry by value: this avoids cloning out of a vector that is about to be dropped, and yields
        // `None` rather than panicking if the list of extensions is ever empty
        inspectors::mime_extension_lookup(self.mime.clone())
            .and_then(|extensions| extensions.into_iter().next())
    }
}
// TODO: test if this actually works on a windows machine
#[cfg(windows)]
@ -52,35 +66,86 @@ fn wanted_file(args: &parameters::Parameters, entry: &DirEntry) -> bool {
return true;
}
let ext = Path::new(entry.file_name()) // create a Path from the entry...
.extension() // get its extension...
.map(|e| String::from(e.to_string_lossy())); // and convert it from an OsStr to a String.
let ext = extension_from_path(entry.path());
if ext.is_none() { return false } // don't scan files without extensions. TODO - this should be configurable
if let Some(extensions) = &args.extensions {
// if the user has specified a list of extensions to check against, make sure this file ends in one of them.
// TODO - maybe use ascii_lowercase instead?
return extensions.contains(&ext.unwrap().to_ascii_lowercase().into())
return extensions.contains(&ext.unwrap().to_lowercase().into())
}
true
}
// Extracts the extension of `path` as an owned String, or `None` if the path has no extension.
fn extension_from_path(path: &Path) -> Option<String> {
    // bail out early when there is no extension at all
    let os_ext = path.extension()?;
    // the extension is an OsStr, so convert it lossily to end up with a String
    Some(String::from(os_ext.to_string_lossy()))
}
// Entry point: parses the command line parameters, collects the files to check from `args.dirs`, inspects each file
// in parallel to compare its detected mimetype against its extension, and logs one line per file.
fn main() {
let args = parameters::Parameters::parse();
// env_logger::init();
env_logger::init();
let db = xdg_mime::SharedMimeInfo::new();
// NOTE(review): the parameters and the "Iterating directory" banner are still printed unconditionally via println!,
// and the banner is then logged again via debug! - presumably the println! calls are leftovers from before logging
// was enabled. confirm which should remain.
println!("{:#?}", args);
// println!("{:#?}", args.dirs);
println!("=====\nIterating directory: {:?}\n=====", args.dirs);
debug!("=====\nIterating directory: {:?}\n=====", args.dirs);
// walk the requested directory tree and gather every entry that should be inspected
let stepper = WalkDir::new(&args.dirs).into_iter();
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(&args, e)) // filter out unwanted files
.filter_map(|e| e.ok()) // ignore anything that fails, e.g. files we don't have read access on
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
.collect();
info!("Found {} items to check", entries.len());
// println!("{:#?}", entries);
// inspect the entries using rayon's parallel iterator. each entry produces either Ok(Findings), or Err(path) if the
// file couldn't be read or its mimetype couldn't be determined.
let results: Vec<Result<Findings, PathBuf>> = entries
.par_iter()
.map(|entry: &DirEntry | {
// try to determine mimetype for this entry
let result = inspectors::mime_type(&db, entry.path());
if let Err(error) = result {
// an error occurred while trying to read the file
error!("{}: {}", entry.path().to_string_lossy(), error);
return Err(entry.path().to_path_buf());
}
// the Err case returned above, so this unwrap can't panic
let result = result.unwrap();
if result.is_none() {
// the file was read successfully, but we were unable to determine its mimetype
warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
return Err(entry.path().to_path_buf());
}
// the None case returned above, so this unwrap can't panic either. from here on, `result` is the mimetype itself
let result = result.unwrap();
// set of known extensions for the given mimetype
let known_exts = inspectors::mime_extension_lookup(result.clone());
// file extension for this particular file
let entry_ext = extension_from_path(entry.path());
// the file's extension is lowercased before being compared against the known extensions
let valid = match known_exts {
// there is a known set of extensions for this mimetype, and the file has an extension
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
// there is a known set of extensions for this mimetype, but the file has no extension
Some(_) => false,
// there is no known set of extensions for this mimetype -- assume it's correct
None => true
};
Ok(Findings {
file: entry.path().to_path_buf(),
valid, // TODO: make this a function
mime: result,
})
})
.collect();
// report every result: successful inspections at info level, failures at warn level
for result in results {
match result {
Ok(r) => info!("{:#?}: {:#?} - {:?} - {:?}", r.file, r.mime, r.valid, r.recommended_extension()),
Err(f) => warn!("{:#?}: Error 0uo", f)
}
}
debug!("Done");
}