rudimentary script output, lots of small stuff, performance improvements

This commit is contained in:
Lynne Megido 2021-02-10 19:20:22 +10:00
parent 9be33cd90f
commit 82bdbebec5
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90
6 changed files with 155 additions and 172 deletions

175
Cargo.lock generated
View file

@ -15,17 +15,6 @@ dependencies = [
"event-listener", "event-listener",
] ]
[[package]]
name = "async-trait"
version = "0.1.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d3a45e77e34375a7923b1e8febb049bb011f064714a8e17a1a616fef01da13d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "atty" name = "atty"
version = "0.2.14" version = "0.2.14"
@ -56,10 +45,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e2afe73808fbaac302e39c9754bfc3c4b4d0f99c9c240b9f4e4efc841ad1b74" checksum = "5e2afe73808fbaac302e39c9754bfc3c4b4d0f99c9c240b9f4e4efc841ad1b74"
dependencies = [ dependencies = [
"async-mutex", "async-mutex",
"async-trait",
"cached_proc_macro", "cached_proc_macro",
"cached_proc_macro_types", "cached_proc_macro_types",
"futures",
"hashbrown", "hashbrown",
"once_cell", "once_cell",
] ]
@ -270,6 +257,7 @@ dependencies = [
"mime_guess", "mime_guess",
"rayon", "rayon",
"smartstring", "smartstring",
"snailquote",
"walkdir", "walkdir",
"xdg-mime", "xdg-mime",
] ]
@ -280,101 +268,6 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
[[package]]
name = "futures-executor"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
[[package]]
name = "futures-macro"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
dependencies = [
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
[[package]]
name = "futures-task"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
dependencies = [
"once_cell",
]
[[package]]
name = "futures-util"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"proc-macro-hack",
"proc-macro-nested",
"slab",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.2" version = "0.2.2"
@ -530,18 +423,6 @@ version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85" checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85"
[[package]]
name = "pin-project-lite"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]] [[package]]
name = "proc-macro-error" name = "proc-macro-error"
version = "1.0.4" version = "1.0.4"
@ -566,18 +447,6 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro-nested"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.24" version = "1.0.24"
@ -661,12 +530,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "slab"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]] [[package]]
name = "smartstring" name = "smartstring"
version = "0.2.6" version = "0.2.6"
@ -676,6 +539,16 @@ dependencies = [
"static_assertions", "static_assertions",
] ]
[[package]]
name = "snailquote"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f34b729d802f52194598858ac852c3fb3b33f6e026cd03195072ccb7bf3fc810"
dependencies = [
"thiserror",
"unicode_categories",
]
[[package]] [[package]]
name = "static_assertions" name = "static_assertions"
version = "1.1.0" version = "1.1.0"
@ -734,6 +607,26 @@ dependencies = [
"unicode-width", "unicode-width",
] ]
[[package]]
name = "thiserror"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76cc616c6abf8c8928e2fdcc0dbfab37175edd8fb49a4641066ad1364fdab146"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9be73a2caec27583d0046ef3796c3794f868a5bc813db689eed00c7631275cd1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "unicase" name = "unicase"
version = "2.6.0" version = "2.6.0"
@ -761,6 +654,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "unicode_categories"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
[[package]] [[package]]
name = "vec_map" name = "vec_map"
version = "0.8.2" version = "0.8.2"

View file

@ -19,7 +19,7 @@ smartstring = "0.2.6"
xdg-mime = {git = "https://github.com/ebassi/xdg-mime-rs", version = "0.3"} xdg-mime = {git = "https://github.com/ebassi/xdg-mime-rs", version = "0.3"}
mime_guess = "2.0.3" mime_guess = "2.0.3"
rayon = "1.5.0" rayon = "1.5.0"
cached = "0.23.0" snailquote = "0.3.0"
[dependencies.clap] [dependencies.clap]
version = "3.0.0-beta.2" version = "3.0.0-beta.2"
@ -30,5 +30,15 @@ version = "0.8.2"
default-features = false default-features = false
features = ["termcolor", "atty"] features = ["termcolor", "atty"]
[dependencies.cached]
version = "0.23.0"
default-features = false
features = ["proc_macro"]
[profile.release] [profile.release]
lto = "thin" lto = "thin"
# optimise dependencies, even when producing debug builds
[profile.dev.package."*"]
opt-level = 3

View file

@ -1,23 +1,79 @@
use std::fmt; use std::io;
use std::fmt::Formatter; use std::io::Write;
use crate::Findings;
use crate::scanerror::ScanError;
use std::path::PathBuf;
use snailquote::escape;
trait Format { type Entries = [Result<Findings, (ScanError, PathBuf)>];
fn rename(f: &mut fmt::Formatter<'_>, from: &str, to: &str) -> fmt::Result;
fn unreadable(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result; pub trait Format {
fn unknown_type(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result; fn new() -> Self;
fn rename<T: Write>(&self, f: &mut T, from: &str, to: &str) -> io::Result<()>;
fn no_known_extension<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()>;
fn unreadable<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()>;
fn unknown_type<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()>;
/// Walks every scan result and emits the matching line of output through `f`.
///
/// Files whose extension already matches their detected mimetype produce no
/// output. Every other entry is dispatched to exactly one of `rename`,
/// `no_known_extension`, `unreadable` or `unknown_type`. The first write
/// error aborts the loop and is returned to the caller.
fn write_all<T: Write>(&self, entries: &Entries, f: &mut T) -> io::Result<()> {
    for entry in entries {
        // deal with failed scans up front so the happy path stays flat
        let finding = match entry {
            Ok(finding) => finding,
            Err((ScanError::File, path)) => {
                // the file could not be read at all
                self.unreadable(f, &path.to_string_lossy())?;
                continue;
            }
            Err((ScanError::Mime, path)) => {
                // the file was readable, but no mimetype could be determined
                self.unknown_type(f, &path.to_string_lossy())?;
                continue;
            }
        };

        // nothing to report when the extension already fits the mimetype
        if finding.valid {
            continue;
        }

        let suggestion = finding.recommended_extension();
        let current = finding.file.to_string_lossy();
        if let Some(ext) = suggestion {
            // a known extension exists for this mimetype, so propose a rename
            let renamed = finding.file.with_extension(ext.as_str());
            self.rename(f, &current, &renamed.to_string_lossy())?;
        } else {
            // the mimetype has no known extension to recommend
            self.no_known_extension(f, &current)?;
        }
    }

    Ok(())
}
} }
struct Script {} pub struct Script {}
impl Format for Script { impl Format for Script {
fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result { fn new() -> Self {
write!(f, "mv {} {}", from, to) return Script {}
} }
fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result { fn rename<T: Write>(&self, f: &mut T, from: &str, to: &str) -> io::Result<()> {
write!(f, "# Failed to read {}", path) // TODO: string escaping aaaaaaAAAAAAAAAA
writeln!(f, "mv -v -i -- {} {}", escape(from), escape(to))
} }
fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result { fn no_known_extension<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()> {
write!(f, "# Failed to detect mime type for {}", path) writeln!(f, "echo No known extension for {}!", escape(path))
}
fn unreadable<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()> {
writeln!(f, "# Failed to read {}", escape(path))
}
fn unknown_type<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()> {
writeln!(f, "# Failed to detect mime type for {}", escape(path))
} }
} }

View file

@ -17,6 +17,7 @@
mod parameters; mod parameters;
mod inspectors; mod inspectors;
mod formats; mod formats;
mod scanerror;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use walkdir::{WalkDir, DirEntry}; use walkdir::{WalkDir, DirEntry};
@ -27,8 +28,12 @@ use log::{debug, trace, info, warn};
use rayon::prelude::*; use rayon::prelude::*;
use std::fmt::{self, Display}; use std::fmt::{self, Display};
use xdg_mime::SharedMimeInfo; use xdg_mime::SharedMimeInfo;
use crate::parameters::OutputFormat;
use crate::scanerror::ScanError;
use crate::formats::{Script, Format};
use std::io::stdout;
struct Findings { pub struct Findings {
file: PathBuf, file: PathBuf,
valid: bool, valid: bool,
mime: Mime, mime: Mime,
@ -41,11 +46,6 @@ impl Findings {
} }
} }
enum ScanError {
File,
Mime
}
impl Display for ScanError { impl Display for ScanError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", write!(f, "{}",
@ -57,14 +57,15 @@ impl Display for ScanError {
} }
} }
// TODO: test if this actually works on a windows machine // TODO: test if this actually works on a windows machine - not that there's much of a point right now, considering
// xdg-mime-rs doesn't support windows
#[cfg(windows)] #[cfg(windows)]
fn is_hidden(entry: &DirEntry) -> bool { fn is_hidden(entry: &DirEntry) -> bool {
use std::os::windows::prelude::*; use std::os::windows::prelude::*;
std::fs::metadata(entry) // try to get metadata for file std::fs::metadata(entry) // try to get metadata for file
.map_or( .map_or(
false, // if getting metadata/attributes fails, assume it's not hidden false, // if getting metadata/attributes fails, assume it's not hidden
|f| f.file_attributes() & 0x2 > 0 // flag for hidden - https://docs.microsoft.com/en-us/windows/win32/fileio/file-attribute-constants |f| f.file_attributes() & 0x2 > 0 // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
) )
} }
@ -102,9 +103,9 @@ fn extension_from_path(path: &Path) -> Option<String> {
fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> { fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
// try to determine mimetype for this entry // try to determine mimetype for this entry
let result = inspectors::mime_type(&db, entry.path()); let result = inspectors::mime_type(db, entry.path());
if let Err(_) = result { if result.is_err() {
// an error occurred while trying to read the file // an error occurred while trying to read the file
// error!("{}: {}", entry.path().to_string_lossy(), error); // error!("{}: {}", entry.path().to_string_lossy(), error);
return Err((ScanError::File, entry.path().to_path_buf())); return Err((ScanError::File, entry.path().to_path_buf()));
@ -138,16 +139,15 @@ fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanErr
valid, // make this a function valid, // make this a function
mime: result, mime: result,
}) })
} }
fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> { fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
#[cfg(feature = "multi-threaded")] { #[cfg(feature = "multi-threaded")] {
// rather than using a standard par_iter, split the entries into chunks of 16 first. // rather than using a standard par_iter, split the entries into chunks of 32 first.
// this allows each spawned thread to handle a whole chunk of files before closing, rather than creating a new thread for
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0 // each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
entries entries
.par_chunks(16) // split into chunks of 16 .par_chunks(32) // split into chunks of 32
.flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>> .flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>>
.iter() // iter over the chunk, which is a slice of DirEntry structs .iter() // iter over the chunk, which is a slice of DirEntry structs
.map(|entry| scan_file(db, entry)) .map(|entry| scan_file(db, entry))
@ -171,7 +171,7 @@ fn main() {
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args())) // .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
.format_module_path(false) // don't include module in logs, as it's not necessary .format_module_path(false) // don't include module in logs, as it's not necessary
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway) .format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
.target(env_logger::Target::Stdout) // log to stdout rather than stderr // .target(env_logger::Target::Stdout) // log to stdout rather than stderr
.init(); .init();
let db = xdg_mime::SharedMimeInfo::new(); let db = xdg_mime::SharedMimeInfo::new();
@ -188,7 +188,7 @@ fn main() {
let results = scan_from_walkdir(&db, entries); let results = scan_from_walkdir(&db, entries);
for result in results { for result in &results {
match result { match result {
Ok(r) => { Ok(r) => {
if !r.valid { if !r.valid {
@ -201,5 +201,13 @@ fn main() {
} }
} }
match args.output_format {
OutputFormat::Script => {
let s = Script::new();
s.write_all(&results, &mut stdout().lock()).expect("failed to ouptput");
},
OutputFormat::Text => debug!("eewr")
}
debug!("Done"); debug!("Done");
} }

View file

@ -1,20 +1,26 @@
use clap::Clap; use clap::{Clap};
use std::path::PathBuf; use std::path::PathBuf;
use smartstring::alias::String; use smartstring::{LazyCompact, SmartString};
// The output formats selectable through the `output_format` command-line parameter.
// NOTE: plain `//` comments are used here on purpose, so that nothing is fed into the
// clap derive and the generated help text stays exactly as it was.
#[derive(Clap, PartialEq, Debug)]
pub enum OutputFormat {
// results are written to stdout by the `Script` formatter
Script,
// selectable, but main currently only emits a debug log line for it
Text
}
#[derive(Clap, Debug)] #[derive(Clap, Debug)]
pub struct Parameters { pub struct Parameters {
/// Only examine files with these extensions (Comma-separated list) /// Only examine files with these extensions (Comma-separated list)
#[clap(short, long, use_delimiter = true)] #[clap(short, long, use_delimiter = true)]
pub extensions: Option<Vec<String>>, pub extensions: Option<Vec<SmartString<LazyCompact>>>,
/// Don't skip hidden files and directories /// Don't skip hidden files and directories
#[clap(short, long)] #[clap(short, long)]
pub scan_hidden: bool, pub scan_hidden: bool,
/// Output format to use. See "--help formats" for more information. /// Output format to use. See "--help formats" for more information.
#[clap(short, long, default_value="script", possible_values = &["script", "text"])] #[clap(short, long, default_value="script", arg_enum)]
pub output_format: String, pub output_format: OutputFormat,
/// Directory to process /// Directory to process
// TODO: right now this can only take a single directory - should this be improved? // TODO: right now this can only take a single directory - should this be improved?

4
src/scanerror.rs Normal file
View file

@ -0,0 +1,4 @@
/// The ways in which scanning a single file can fail.
///
/// The standard traits are derived so that values can be compared, copied
/// and printed with `{:?}` (for example in logs and assertions).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanError {
    /// The file could not be read.
    File,
    /// The file was read successfully, but no mimetype could be determined.
    Mime,
}