rudimentary script output, lots of small stuff, performance improvements

This commit is contained in:
Lynne Megido 2021-02-10 19:20:22 +10:00
parent 9be33cd90f
commit 82bdbebec5
Signed by: lynnesbian
GPG Key ID: F0A184B5213D9F90
6 changed files with 155 additions and 172 deletions

175
Cargo.lock generated
View File

@ -15,17 +15,6 @@ dependencies = [
"event-listener",
]
[[package]]
name = "async-trait"
version = "0.1.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d3a45e77e34375a7923b1e8febb049bb011f064714a8e17a1a616fef01da13d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "atty"
version = "0.2.14"
@ -56,10 +45,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e2afe73808fbaac302e39c9754bfc3c4b4d0f99c9c240b9f4e4efc841ad1b74"
dependencies = [
"async-mutex",
"async-trait",
"cached_proc_macro",
"cached_proc_macro_types",
"futures",
"hashbrown",
"once_cell",
]
@ -270,6 +257,7 @@ dependencies = [
"mime_guess",
"rayon",
"smartstring",
"snailquote",
"walkdir",
"xdg-mime",
]
@ -280,101 +268,6 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65"
[[package]]
name = "futures-executor"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500"
[[package]]
name = "futures-macro"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd"
dependencies = [
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6"
[[package]]
name = "futures-task"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86"
dependencies = [
"once_cell",
]
[[package]]
name = "futures-util"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"proc-macro-hack",
"proc-macro-nested",
"slab",
]
[[package]]
name = "getrandom"
version = "0.2.2"
@ -530,18 +423,6 @@ version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85"
[[package]]
name = "pin-project-lite"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
@ -566,18 +447,6 @@ dependencies = [
"version_check",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro-nested"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"
[[package]]
name = "proc-macro2"
version = "1.0.24"
@ -661,12 +530,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "slab"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]]
name = "smartstring"
version = "0.2.6"
@ -676,6 +539,16 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "snailquote"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f34b729d802f52194598858ac852c3fb3b33f6e026cd03195072ccb7bf3fc810"
dependencies = [
"thiserror",
"unicode_categories",
]
[[package]]
name = "static_assertions"
version = "1.1.0"
@ -734,6 +607,26 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "thiserror"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76cc616c6abf8c8928e2fdcc0dbfab37175edd8fb49a4641066ad1364fdab146"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9be73a2caec27583d0046ef3796c3794f868a5bc813db689eed00c7631275cd1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicase"
version = "2.6.0"
@ -761,6 +654,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "unicode_categories"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
[[package]]
name = "vec_map"
version = "0.8.2"

View File

@ -19,7 +19,7 @@ smartstring = "0.2.6"
xdg-mime = {git = "https://github.com/ebassi/xdg-mime-rs", version = "0.3"}
mime_guess = "2.0.3"
rayon = "1.5.0"
cached = "0.23.0"
snailquote = "0.3.0"
[dependencies.clap]
version = "3.0.0-beta.2"
@ -30,5 +30,15 @@ version = "0.8.2"
default-features = false
features = ["termcolor", "atty"]
[dependencies.cached]
version = "0.23.0"
default-features = false
features = ["proc_macro"]
[profile.release]
lto = "thin"
# optimise dependencies, even when producing debug builds
[profile.dev.package."*"]
opt-level = 3

View File

@ -1,23 +1,79 @@
use std::fmt;
use std::fmt::Formatter;
use std::io;
use std::io::Write;
use crate::Findings;
use crate::scanerror::ScanError;
use std::path::PathBuf;
use snailquote::escape;
trait Format {
fn rename(f: &mut fmt::Formatter<'_>, from: &str, to: &str) -> fmt::Result;
fn unreadable(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result;
fn unknown_type(f: &mut fmt::Formatter<'_>, path: &str) -> fmt::Result;
/// The slice of scan outcomes a formatter consumes: each element is either the `Findings` for a file, or the
/// `ScanError` that occurred together with the path it occurred on.
type Entries = [Result<Findings, (ScanError, PathBuf)>];
/// A way of presenting scan results. Implementors supply one writer method per kind of outcome;
/// `write_all` walks a batch of results and dispatches each one to the appropriate method.
pub trait Format {
    /// Builds a formatter.
    fn new() -> Self;

    /// Writes the output for a file at `from` whose corrected name is `to`.
    fn rename<T: Write>(&self, f: &mut T, from: &str, to: &str) -> io::Result<()>;

    /// Writes the output for a wrongly-named file at `path` whose mimetype has no known extension.
    fn no_known_extension<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()>;

    /// Writes the output for a file at `path` that could not be read.
    fn unreadable<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()>;

    /// Writes the output for a file at `path` whose mimetype could not be determined.
    fn unknown_type<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()>;

    /// Writes every entry in `entries` to `f`, stopping at (and returning) the first I/O error.
    ///
    /// Files whose extension is already valid produce no output at all.
    fn write_all<T: Write>(&self, entries: &Entries, f: &mut T) -> io::Result<()> {
        entries.iter().try_for_each(|entry| match entry {
            // scanned fine and the extension already matches - nothing to report
            Ok(finding) if finding.valid => Ok(()),
            // scanned fine, but the extension is wrong
            Ok(finding) => {
                let suggestion = finding.recommended_extension();
                let current = finding.file.to_string_lossy();
                if let Some(ext) = suggestion {
                    // the mimetype has a known extension, so a rename can be suggested
                    let renamed = finding.file.with_extension(ext.as_str());
                    self.rename(f, &current, &renamed.to_string_lossy())
                } else {
                    // no known extension for this mimetype
                    self.no_known_extension(f, &current)
                }
            }
            // the file could not be read
            Err((ScanError::File, path)) => self.unreadable(f, &path.to_string_lossy()),
            // the file was read, but no mimetype could be determined
            Err((ScanError::Mime, path)) => self.unknown_type(f, &path.to_string_lossy()),
        })
    }
}
struct Script {}
pub struct Script {}
impl Format for Script {
fn rename(f: &mut Formatter<'_>, from: &str, to: &str) -> fmt::Result {
write!(f, "mv {} {}", from, to)
fn new() -> Self {
return Script {}
}
fn unreadable(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
write!(f, "# Failed to read {}", path)
fn rename<T: Write>(&self, f: &mut T, from: &str, to: &str) -> io::Result<()> {
// TODO: string escaping aaaaaaAAAAAAAAAA
writeln!(f, "mv -v -i -- {} {}", escape(from), escape(to))
}
fn unknown_type(f: &mut Formatter<'_>, path: &str) -> fmt::Result {
write!(f, "# Failed to detect mime type for {}", path)
fn no_known_extension<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()> {
writeln!(f, "echo No known extension for {}!", escape(path))
}
fn unreadable<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()> {
writeln!(f, "# Failed to read {}", escape(path))
}
fn unknown_type<T: Write>(&self, f: &mut T, path: &str) -> io::Result<()> {
writeln!(f, "# Failed to detect mime type for {}", escape(path))
}
}

View File

@ -17,6 +17,7 @@
mod parameters;
mod inspectors;
mod formats;
mod scanerror;
use std::path::{Path, PathBuf};
use walkdir::{WalkDir, DirEntry};
@ -27,8 +28,12 @@ use log::{debug, trace, info, warn};
use rayon::prelude::*;
use std::fmt::{self, Display};
use xdg_mime::SharedMimeInfo;
use crate::parameters::OutputFormat;
use crate::scanerror::ScanError;
use crate::formats::{Script, Format};
use std::io::stdout;
struct Findings {
pub struct Findings {
file: PathBuf,
valid: bool,
mime: Mime,
@ -41,11 +46,6 @@ impl Findings {
}
}
enum ScanError {
File,
Mime
}
impl Display for ScanError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}",
@ -57,14 +57,15 @@ impl Display for ScanError {
}
}
// TODO: test if this actually works on a windows machine
// TODO: test if this actually works on a windows machine - not that there's much of a point right now, considering
// xdg-mime-rs doesn't support windows
#[cfg(windows)]
fn is_hidden(entry: &DirEntry) -> bool {
use std::os::windows::prelude::*;
std::fs::metadata(entry) // try to get metadata for file
.map_or(
false, // if getting metadata/attributes fails, assume it's not hidden
|f| f.file_attributes() & 0x2 > 0 // flag for hidden - https://docs.microsoft.com/en-us/windows/win32/fileio/file-attribute-constants
|f| f.file_attributes() & 0x2 > 0 // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
)
}
@ -102,9 +103,9 @@ fn extension_from_path(path: &Path) -> Option<String> {
fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
// try to determine mimetype for this entry
let result = inspectors::mime_type(&db, entry.path());
let result = inspectors::mime_type(db, entry.path());
if let Err(_) = result {
if result.is_err() {
// an error occurred while trying to read the file
// error!("{}: {}", entry.path().to_string_lossy(), error);
return Err((ScanError::File, entry.path().to_path_buf()));
@ -138,16 +139,15 @@ fn scan_file(db: &SharedMimeInfo, entry: &DirEntry) -> Result<Findings, (ScanErr
valid, // make this a function
mime: result,
})
}
fn scan_from_walkdir(db: &SharedMimeInfo, entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
#[cfg(feature = "multi-threaded")] {
// rather than using a standard par_iter, split the entries into chunks of 16 first.
// rather than using a standard par_iter, split the entries into chunks of 32 first.
// this allows each spawned thread to handle a whole chunk of files before closing, rather than creating a new thread for
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
entries
.par_chunks(16) // split into chunks of 16
.par_chunks(32) // split into chunks of 32
.flat_map(|chunk| chunk // return Vec<...> instead of Chunk<Vec<...>>
.iter() // iter over the chunk, which is a slice of DirEntry structs
.map(|entry| scan_file(db, entry))
@ -171,7 +171,7 @@ fn main() {
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
.format_module_path(false) // don't include module in logs, as it's not necessary
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
.target(env_logger::Target::Stdout) // log to stdout rather than stderr
// .target(env_logger::Target::Stdout) // log to stdout rather than stderr
.init();
let db = xdg_mime::SharedMimeInfo::new();
@ -188,7 +188,7 @@ fn main() {
let results = scan_from_walkdir(&db, entries);
for result in results {
for result in &results {
match result {
Ok(r) => {
if !r.valid {
@ -201,5 +201,13 @@ fn main() {
}
}
// emit the scan results in whichever format was requested on the command line
match args.output_format {
    OutputFormat::Script => {
        // stdout is locked once and handed to the formatter, which writes every entry through it
        let s = Script::new();
        s.write_all(&results, &mut stdout().lock()).expect("failed to output");
    },
    // NOTE(review): text output looks unimplemented - this arm only emits a placeholder debug log
    OutputFormat::Text => debug!("eewr")
}
debug!("Done");
}

View File

@ -1,20 +1,26 @@
use clap::Clap;
use clap::{Clap};
use std::path::PathBuf;
use smartstring::alias::String;
use smartstring::{LazyCompact, SmartString};
// The output formats that can be chosen on the command line.
// (plain `//` comments are used on purpose, so the input to the Clap derive is left unchanged)
#[derive(Clap, PartialEq, Debug)]
pub enum OutputFormat {
// results are written out as a script
Script,
// NOTE(review): presumably plain human-readable output - no writer for it is visible here; confirm
Text
}
#[derive(Clap, Debug)]
pub struct Parameters {
/// Only examine files with these extensions (Comma-separated list)
#[clap(short, long, use_delimiter = true)]
pub extensions: Option<Vec<String>>,
pub extensions: Option<Vec<SmartString<LazyCompact>>>,
/// Don't skip hidden files and directories
#[clap(short, long)]
pub scan_hidden: bool,
/// Output format to use. See "--help formats" for more information.
#[clap(short, long, default_value="script", possible_values = &["script", "text"])]
pub output_format: String,
#[clap(short, long, default_value="script", arg_enum)]
pub output_format: OutputFormat,
/// Directory to process
// TODO: right now this can only take a single directory - should this be improved?

4
src/scanerror.rs Normal file
View File

@ -0,0 +1,4 @@
/// The ways in which scanning a single file can fail.
///
/// The common traits are derived so that values can be logged with `{:?}`, compared in tests, and copied freely -
/// the enum carries no data, so copying is trivial.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanError {
    /// The file could not be read.
    File,
    /// The file was read successfully, but its mimetype could not be determined.
    Mime,
}