Compare commits

..

No commits in common. "19038d349357e4780c3d9920a446a66752a158f8" and "dc2f642171f812801f5d42d09ddb6132441f34b8" have entirely different histories.

13 changed files with 126 additions and 377 deletions

2
.gitignore vendored
View file

@ -2,6 +2,6 @@
/imgs /imgs
fif_* fif_*
/old /old
/awful
*.sh *.sh
!clippy.sh
cargo-timing*.html cargo-timing*.html

94
Cargo.lock generated
View file

@ -39,12 +39,6 @@ dependencies = [
"once_cell", "once_cell",
] ]
[[package]]
name = "cc"
version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -83,6 +77,12 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "const_fn"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6"
[[package]] [[package]]
name = "crossbeam-channel" name = "crossbeam-channel"
version = "0.5.0" version = "0.5.0"
@ -106,28 +106,27 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-epoch" name = "crossbeam-epoch"
version = "0.9.2" version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d60ab4a8dba064f2fbb5aa270c28da5cf4bbd0e72dae1140a6b0353a779dbe00" checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"const_fn",
"crossbeam-utils", "crossbeam-utils",
"lazy_static", "lazy_static",
"loom",
"memoffset", "memoffset",
"scopeguard", "scopeguard",
] ]
[[package]] [[package]]
name = "crossbeam-utils" name = "crossbeam-utils"
version = "0.8.2" version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bae8f328835f8f5a6ceb6a7842a7f2d0c03692adb5c889347235d59194731fe3" checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"cfg-if", "cfg-if",
"lazy_static", "lazy_static",
"loom",
] ]
[[package]] [[package]]
@ -168,20 +167,13 @@ dependencies = [
"termcolor", "termcolor",
] ]
[[package]]
name = "exitcode"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193"
[[package]] [[package]]
name = "fif" name = "fif"
version = "0.2.3" version = "0.2.1"
dependencies = [ dependencies = [
"cached", "cached",
"clap", "clap",
"env_logger", "env_logger",
"exitcode",
"infer", "infer",
"log", "log",
"mime_guess", "mime_guess",
@ -193,19 +185,6 @@ dependencies = [
"xdg-mime", "xdg-mime",
] ]
[[package]]
name = "generator"
version = "0.6.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cdc09201b2e8ca1b19290cf7e65de2246b8e91fb6874279722189c4de7b94dc"
dependencies = [
"cc",
"libc",
"log",
"rustc_version",
"winapi",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.2" version = "0.2.2"
@ -297,17 +276,6 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "loom"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d44c73b4636e497b4917eb21c33539efa3816741a2d3ff26c6316f1b529481a4"
dependencies = [
"cfg-if",
"generator",
"scoped-tls",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.3.4" version = "2.3.4"
@ -458,15 +426,6 @@ dependencies = [
"redox_syscall", "redox_syscall",
] ]
[[package]]
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
dependencies = [
"semver",
]
[[package]] [[package]]
name = "ryu" name = "ryu"
version = "1.0.5" version = "1.0.5"
@ -482,33 +441,12 @@ dependencies = [
"winapi-util", "winapi-util",
] ]
[[package]]
name = "scoped-tls"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2"
[[package]] [[package]]
name = "scopeguard" name = "scopeguard"
version = "1.1.0" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "semver"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
dependencies = [
"semver-parser",
]
[[package]]
name = "semver-parser"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]] [[package]]
name = "smartstring" name = "smartstring"
version = "0.2.6" version = "0.2.6"
@ -576,18 +514,18 @@ dependencies = [
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.24" version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0f4a65597094d4483ddaed134f409b2cb7c1beccf25201a9f73c719254fa98e" checksum = "76cc616c6abf8c8928e2fdcc0dbfab37175edd8fb49a4641066ad1364fdab146"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "1.0.24" version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" checksum = "9be73a2caec27583d0046ef3796c3794f868a5bc813db689eed00c7631275cd1"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",

View file

@ -1,16 +1,11 @@
[package] [package]
name = "fif" name = "fif"
description = "A command-line tool for detecting and optionally correcting files with incorrect extensions." description = "A command-line tool for detecting and optionally correcting files with incorrect extensions."
version = "0.2.3" version = "0.2.1"
authors = ["Lynnesbian <lynne@bune.city>"] authors = ["Lynnesbian <lynne@bune.city>"]
edition = "2018" edition = "2018"
license = "GPL-3.0-or-later" license = "GPL-3.0-or-later"
rust-version = "1.43.0" # cached requires 1.42.0 rust-version = "1.43.0" # cached requires 1.42.0
repository = "https://git.bune.city/lynnesbian/fif"
readme = "README.md"
keywords = ["mime", "mimetype", "utilities", "tools"]
categories = ["command-line-utilities"]
exclude = [".idea/", "Cross.toml", "*.sh"]
#resolver = "2" #resolver = "2"
#license-file = "LICENSE" #license-file = "LICENSE"
@ -29,10 +24,7 @@ snailquote = "0.3.0"
once_cell = "1.5.2" once_cell = "1.5.2"
rayon = { version = "1.5.0", optional = true } rayon = { version = "1.5.0", optional = true }
infer = { version = "0.3.4", optional = true } infer = { version = "0.3.4", optional = true }
exitcode = "1.1.2"
# use git version while waiting on a release incorporating https://github.com/ebassi/xdg-mime-rs/commit/de5a6dd # use git version while waiting on a release incorporating https://github.com/ebassi/xdg-mime-rs/commit/de5a6dd
[target.'cfg(not(target_os = "windows"))'.dependencies]
xdg-mime = {git = "https://github.com/ebassi/xdg-mime-rs", version = "0.3", rev = "de5a6dd", optional = true } xdg-mime = {git = "https://github.com/ebassi/xdg-mime-rs", version = "0.3", rev = "de5a6dd", optional = true }
[dependencies.clap] [dependencies.clap]

View file

@ -1,2 +0,0 @@
[build.env]
passthrough = ["RUST_BACKTRACE", "RUST_LOG"]

View file

@ -1,33 +0,0 @@
fif
===
A command-line tool for detecting and optionally correcting files with incorrect extensions.
## Installation
```bash
cargo install --locked fif
```
## Usage
See `fif --help` for more.
### The basics
The simplest way to use fif looks like this:
```bash
fif -E images ~/Pictures
```
This command will scan all of the files with extensions used by image files (.jpg, .png, etc.) in your `~/Pictures`
directory.
You can also manually specify a set of extensions to use:
```bash
fif -e jpeg,jpg,zip,docx ~/Documents
```
By default, fif will output a bash script that can be used to fix all the files it found with incorrect file extensions.
You might find it useful to output this script to a file (rather than to stdout):
```bash
fif -E images ~/Pictures > output.sh
```

View file

@ -1,17 +0,0 @@
#!/bin/bash
# Runs clippy over the crate with several additional lint groups enabled as warnings.

# touch every .rs file before linting
# (presumably so cargo treats the sources as modified and clippy re-checks them instead of reusing a cached result -
# TODO confirm)
fd -e rs -x touch {}

# ALLOWS:
# unused_io_amount: there are two places where i want to read up to X bytes and i'm fine with getting less than that
# redundant_closure...: the alternative is often much more verbose
# shadow_unrelated: sometimes things that seem unrelated are actually related ;)
#
# note: the final argument line deliberately has no trailing backslash - a dangling continuation would splice the next
# line of the script into this command.
cargo clippy -- \
  -W clippy::nursery \
  -W clippy::perf \
  -W clippy::pedantic \
  -W clippy::complexity \
  -W clippy::cargo \
  -A clippy::unused_io_amount \
  -A clippy::redundant_closure_for_method_calls \
  -A clippy::shadow_unrelated

View file

@ -7,22 +7,18 @@ pub enum ExtensionSet {
Videos, Videos,
Media, Media,
Documents, Documents,
Archives, Archives
} }
impl ExtensionSet { impl ExtensionSet {
pub fn extensions(&self) -> Vec<&str> { pub fn extensions(&self) -> Vec<&str> {
match self { match self {
Self::Images => mime_guess::get_mime_extensions_str("image/*"), Self::Images => vec!["png", "jpg", "jpeg", "webp", "raw", "gif", "apng", "tga", "bmp", "tif", "tiff", "heif",
Self::Videos => mime_guess::get_mime_extensions_str("video/*"), "avif", "jp2", "mng", "svg"],
Self::Audio => mime_guess::get_mime_extensions_str("audio/*"), Self::Videos => vec!["webm", "mp4", "mkv", "mov", "avi", "m4v", "wmv", "bik", "ogv", "qt", "3gp", "3g2", "divx"],
Self::Documents => Some(&[ Self::Audio => vec!["ogg", "oga", "opus", "mp3", "m4a", "aac", "flac", "ape", "midi", "mid", "alac", "wav",
"doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "pdf", "odt", "ods", "odp", "aiff", "aa3", "at3"],
][..]), _ => todo!()
// many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used }
// somehow to extract extensions for compressed files from mime_guess?
Self::Archives => Some(&["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2"][..]),
_ => todo!(),
}.unwrap().to_vec()
} }
} }

View file

@ -1,6 +1,8 @@
use std::io::{self, Write}; use std::io::{self, Write};
#[cfg(unix)] #[cfg(unix)]
use std::os::unix::ffi::OsStrExt; use std::os::unix::ffi::OsStrExt;
#[cfg(windows)]
use std::os::windows::ffi::OsStrExt;
use std::path::PathBuf; use std::path::PathBuf;
use snailquote::escape; use snailquote::escape;
@ -12,49 +14,20 @@ const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION");
type Entries = [Result<Findings, (ScanError, PathBuf)>]; type Entries = [Result<Findings, (ScanError, PathBuf)>];
enum Writable<'a> { fn write_pathbuf<W: Write>(f: &mut W, path: &PathBuf) -> io::Result<()> {
String(&'a str), match path.to_str() {
Path(&'a PathBuf), Some(string) => {
Space, write!(f, "{}", escape(string))
Newline,
}
// the lifetime of a lifetime
impl<'a> From<&'a str> for Writable<'a> {
fn from(s: &'a str) -> Writable<'a> {
Writable::String(s)
} }
} None => {
write!(f, "'")?;
impl<'a> From<&'a PathBuf> for Writable<'a> {
fn from(p: &'a PathBuf) -> Writable<'a> {
Writable::Path(p)
}
}
fn smart_write<W: Write>(f: &mut W, writeables: &[Writable]) -> io::Result<()> {
// ehhhh
for writeable in writeables {
match writeable {
Writable::Space => write!(f, " ")?,
Writable::Newline => writeln!(f, )?,
Writable::String(s) => write!(f, "{}", s)?,
Writable::Path(path) => {
if let Some(string) = path.to_str() {
write!(f, "{}", escape(string))?
} else {
write!(f, "'''")?;
#[cfg(unix)] #[cfg(unix)]
f.write_all(&*path.as_os_str().as_bytes())?; f.write_all(&*path.as_os_str().as_bytes())?;
#[cfg(windows)] #[cfg(windows)]
write!(f, "{}", path.as_os_str().to_string_lossy())?; // TODO: implement bonked strings for windows f.write_all(&*path.as_os_str().encode_wide().collect())?; // TODO: TEST THIS
// f.write_all(&*path.as_os_str().encode_wide().collect::<Vec<u16>>())?; write!(f, "'")
write!(f, "'''")?
} }
} }
}
}
Ok(())
} }
pub trait Format { pub trait Format {
@ -67,19 +40,22 @@ pub trait Format {
fn footer<W: Write>(&self, entries: &Entries, f: &mut W) -> io::Result<()>; fn footer<W: Write>(&self, entries: &Entries, f: &mut W) -> io::Result<()>;
fn write_all<W: Write>(&self, entries: &Entries, f: &mut W) -> io::Result<()> { fn write_all<W: Write>(&self, entries: &Entries, f: &mut W) -> io::Result<()> {
// TODO: clean this up - it's kinda messy // TODO: clean this up - it's horrifying
self.header(entries, f)?; self.header(entries, f)?;
for entry in entries { for entry in entries {
match entry { match entry {
Ok(finding) => { Ok(finding) => {
// the file was successfully scanned, and a mimetype was detected
if !finding.valid {
// the file's extension is wrong - check for known extension
if let Some(ext) = finding.recommended_extension() { if let Some(ext) = finding.recommended_extension() {
self.rename(f, &finding.file, &finding.file.with_extension(ext.as_str()))? self.rename(f, &finding.file, &finding.file.with_extension(ext.as_str()))?
} else { } else {
self.no_known_extension(f, &finding.file)? self.no_known_extension(f, &finding.file)?
} }
} }
}
Err(error) => { Err(error) => {
// something went wrong 0uo // something went wrong 0uo
match error.0 { match error.0 {
@ -105,37 +81,30 @@ impl Format for Script {
} }
fn rename<W: Write>(&self, f: &mut W, from: &PathBuf, to: &PathBuf) -> io::Result<()> { fn rename<W: Write>(&self, f: &mut W, from: &PathBuf, to: &PathBuf) -> io::Result<()> {
smart_write(f, &[ // TODO: surely there's a better way...
"mv -v -i -- ".into(), write!(f, "mv -v -i -- ")?;
from.into(), write_pathbuf(f, from)?;
Writable::Space, write!(f, " ")?;
to.into(), write_pathbuf(f, to)?;
Writable::Newline writeln!(f,)
])
} }
fn no_known_extension<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { fn no_known_extension<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
smart_write(f, &[ write!(f, "printf No known extension for ")?;
"echo No known extension for ".into(), write_pathbuf(f, path)?;
path.into(), writeln!(f,"\nprintf '\n'")
Writable::Newline
])
} }
fn unreadable<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { fn unreadable<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
smart_write(f, &[ write!(f, "# Failed to read ")?;
"# Failed to read ".into(), write_pathbuf(f, path)?;
path.into(), writeln!(f,)
Writable::Newline
])
} }
fn unknown_type<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { fn unknown_type<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
smart_write(f, &[ write!(f, "# Failed to detect mime type for ")?;
"# Failed to detect mime type for ".into(), write_pathbuf(f, path)?;
path.into(), writeln!(f,)
Writable::Newline
])
} }
fn header<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> { fn header<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> {
@ -147,6 +116,6 @@ impl Format for Script {
} }
fn footer<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> { fn footer<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> {
writeln!(f, "\necho 'Done.'") writeln!(f, "\nprintf 'Done.\\n'")
} }
} }

View file

@ -17,19 +17,14 @@ use crate::mimedb::MimeDb;
// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix // unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix
// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway, // world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway,
// so maybe it's fine...? maybe this should be configurable by the user? i don't know. // so maybe it's fine...? maybe this should be configurable by the user? i don't know.
// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that xdg-mime requires // empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that mime_type requires
// at least 265 bytes to identify a tar file. // at least 265 bytes to identify a tar file.
// additionally, since many formats can be identified with ≤64 bytes, it's worth reading 64 bytes, checking for the mime
// type, and then reading the full 512 bytes if necessary. in most cases, this will end up being faster on the whole,
// even though two reads are needed for certain formats, unless the directory being scanned is predominantly made up of
// such formats.
const INITIAL_BUF_SIZE: usize = 64;
const BUF_SIZE: usize = 512; const BUF_SIZE: usize = 512;
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> { pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
let mut buffer = [0; INITIAL_BUF_SIZE]; // attempt to read up to the BUF_SIZE bytes of the file
let mut buffer = [0; 64];
let mut file = File::open(path)?; let mut file = File::open(path)?;
// this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the // this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the
@ -38,21 +33,14 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
file.read(&mut buffer)?; file.read(&mut buffer)?;
let r = db.get_type(&buffer); let r = db.get_type(&buffer);
if r.is_some() { if r.is_some() {
return Ok(r); return Ok(r);
} }
// attempt to read up to the BUF_SIZE bytes of the file.
// we've already read the first 64 bytes into a buffer, but i can't see an obvious way to reuse those 64 bytes that's
// faster than simply moving the seek position back to the start of the file and re-reading the whole 512 bytes.
// for example, starting with a buffer of 64 bytes, then creating a new 512 byte buffer from the contents of the first
// buffer with (512 - 64) blank bytes, then finally reading the rest, is much slower than simply reading the file
// twice. i don't at all doubt that there IS a way to do this efficiently, and i can think of a way in principle, but
// i'm not sure how to express it in a way that is both idiomatic/safe and fast.
let mut buffer = [0; BUF_SIZE]; let mut buffer = [0; BUF_SIZE];
file.seek(SeekFrom::Start(0))?; file.seek(SeekFrom::Start(0))?;
file.read(&mut buffer)?; file.read(&mut buffer)?;
// warn!("dang");
Ok(db.get_type(&buffer)) Ok(db.get_type(&buffer))
} }
@ -61,34 +49,13 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
cached! { cached! {
MIMEXT; MIMEXT;
fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = { fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
if mime == mime_guess::mime::IMAGE_JPEG {
// match on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions ignores the type // jpeg files are given the primary extension "jpe", due to the extension list being stored in alphabetical order.
// suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. passing the essence_str // to handle this particular case, return a custom vector consisting of just "jpg" and "jpeg".
// (which includes the suffix) fixes this. return Some(vec![String::from("jpg"), String::from("jpeg")]);
match mime_guess::get_mime_extensions_str(mime.essence_str()) { }
Some(exts) => { match mime_guess::get_mime_extensions(&mime) { // get a list of possible extensions for this mime type
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect(); Some(exts) => Some(exts.iter().map(|e| String::from(*e)).collect()),
Some(if mime == mime_guess::mime::IMAGE_JPEG {
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
[vec![String::from("jpg")], possible_exts].concat()
} else if mime == mime_guess::mime::TEXT_XML {
// a somewhat similar case arises with XML files - the first suggested extension is "addin", when it should
// (in my opinion) be "xml".
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
// to have valid extensions.
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
} else {
possible_exts
})
},
None => None None => None
} }
} }

View file

@ -18,7 +18,7 @@ use std::io::{stdout, BufWriter};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use clap::Clap; use clap::Clap;
use log::{debug, error, info, trace, warn}; use log::{debug, info, trace, warn};
use once_cell::sync::OnceCell; use once_cell::sync::OnceCell;
#[cfg(feature = "multi-threaded")] #[cfg(feature = "multi-threaded")]
use rayon::prelude::*; use rayon::prelude::*;
@ -30,15 +30,14 @@ use crate::formats::{Format, Script};
use crate::mimedb::MimeDb; use crate::mimedb::MimeDb;
use crate::parameters::OutputFormat; use crate::parameters::OutputFormat;
use crate::scanerror::ScanError; use crate::scanerror::ScanError;
use std::process::exit;
mod extensionset;
mod findings; mod findings;
mod formats; mod formats;
mod inspectors; mod inspectors;
mod mimedb; mod mimedb;
mod parameters; mod parameters;
mod scanerror; mod scanerror;
mod extensionset;
#[cfg(feature = "infer-backend")] #[cfg(feature = "infer-backend")]
static MIMEDB: OnceCell<mimedb::InferDb> = OnceCell::new(); static MIMEDB: OnceCell<mimedb::InferDb> = OnceCell::new();
@ -50,7 +49,7 @@ static MIMEDB: OnceCell<mimedb::XdgDb> = OnceCell::new();
#[cfg(windows)] #[cfg(windows)]
fn is_hidden(entry: &DirEntry) -> bool { fn is_hidden(entry: &DirEntry) -> bool {
use std::os::windows::prelude::*; use std::os::windows::prelude::*;
std::fs::metadata(entry.path()) // try to get metadata for file std::fs::metadata(entry) // try to get metadata for file
.map_or( .map_or(
false, // if getting metadata/attributes fails, assume it's not hidden false, // if getting metadata/attributes fails, assume it's not hidden
|f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
@ -117,18 +116,20 @@ fn scan_file(entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
let valid = match known_exts { let valid = match known_exts {
// there is a known set of extensions for this mimetype, and the file has an extension // there is a known set of extensions for this mimetype, and the file has an extension
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()), Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
// either this file has no extension, or there is no known set of extensions for this mimetype :( // there is a known set of extensions for this mimetype, but the file has no extension
Some(_) | None => false, Some(_) => false,
// there is no known set of extensions for this mimetype -- assume it's correct
None => true,
}; };
Ok(Findings { Ok(Findings {
file: entry.path().to_path_buf(), file: entry.path().to_path_buf(),
valid, valid, // make this a function
mime: result, mime: result,
}) })
} }
fn scan_from_walkdir(entries: &[DirEntry]) -> Vec<Result<Findings, (ScanError, PathBuf)>> { fn scan_from_walkdir(entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
#[cfg(feature = "multi-threaded")] #[cfg(feature = "multi-threaded")]
{ {
// rather than using a standard par_iter, split the entries into chunks of 32 first. // rather than using a standard par_iter, split the entries into chunks of 32 first.
@ -153,7 +154,6 @@ fn scan_from_walkdir(entries: &[DirEntry]) -> Vec<Result<Findings, (ScanError, P
fn main() { fn main() {
let args = parameters::Parameters::parse(); let args = parameters::Parameters::parse();
let mut builder = env_logger::Builder::from_default_env(); let mut builder = env_logger::Builder::from_default_env();
builder builder
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args())) // .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
@ -165,19 +165,22 @@ fn main() {
#[cfg(feature = "infer-backend")] #[cfg(feature = "infer-backend")]
MIMEDB MIMEDB
.set(mimedb::InferDb::init()) .set(mimedb::InferDb::init())
.or(Err("Failed to initialise Infer backend!")) .or(Err("Failed to initialise MIMEDB"))
.unwrap(); .unwrap();
#[cfg(feature = "xdg-mime-backend")] #[cfg(feature = "xdg-mime-backend")]
MIMEDB MIMEDB
.set(mimedb::XdgDb::init()) .set(mimedb::XdgDb::init())
.or(Err("Failed to initialise XDG Mime backend!")) .or(Err("Failed to initialise MIMEDB"))
.unwrap(); .unwrap();
debug!("Iterating directory: {:?}", args.dirs); debug!("Iterating directory: {:?}", args.dirs);
let extensions: Vec<&str> = if let Some(exts) = &args.exts { let extensions: Vec<&str> = if let Some(exts) = &args.exts {
exts.iter().map(|s| s.as_str()).collect() exts
.iter()
.map(|s| s.as_str())
.collect()
} else if let Some(exts) = &args.ext_set { } else if let Some(exts) = &args.ext_set {
exts.extensions().to_vec() exts.extensions().to_vec()
} else { } else {
@ -187,81 +190,38 @@ fn main() {
debug!("Checking files with extensions: {:?}", extensions); debug!("Checking files with extensions: {:?}", extensions);
let stepper = WalkDir::new(&args.dirs).into_iter(); let stepper = WalkDir::new(&args.dirs).into_iter();
let mut probably_fatal_error = false;
let entries: Vec<DirEntry> = stepper let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(&args, &extensions, e)) // filter out unwanted files .filter_entry(|e| wanted_file(&args, &extensions, e)) // filter out unwanted files
.filter_map(|e| { .filter_map(|e| e.ok()) // ignore anything that fails, e.g. files we don't have read access on
if let Err(err) = &e {
debug!("uh oh spaghettio!! {:#?}", e);
// log errors to stdout, and remove them from the iterator
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
if err.depth() == 0 {
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
probably_fatal_error = true;
}
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
error!(
"{}: {}",
path,
err.io_error().map_or(err.to_string(), |e| e.to_string())
);
return None;
}
e.ok()
})
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list .filter(|e| !e.file_type().is_dir()) // remove directories from the final list
.collect(); .collect();
if entries.is_empty() {
if probably_fatal_error {
// no need to log anything for fatal errors - fif will already have printed something obvious like
// "[ERROR] /fake/path: No such file or directory (os error 2)". we can assume that if this has happened, the dir
// given as input doesn't exist or is otherwise unreadable.
exit(exitcode::NOINPUT);
}
warn!("No files matching requested options found.");
exit(exitcode::DATAERR);
}
trace!("Found {} items to check", entries.len()); trace!("Found {} items to check", entries.len());
let results: Vec<_> = scan_from_walkdir(&entries) let results = scan_from_walkdir(entries);
.into_iter()
.filter(
|result| result.is_err() || !result.as_ref().unwrap().valid,
// TODO: find a way to trace! the valid files without doing ↓
// || if result.as_ref().unwrap().valid { trace!("{:?} is fine", result.as_ref().unwrap().file); false } else { true }
)
.collect();
for result in &results { for result in &results {
match result { match result {
Ok(r) => { Ok(r) => {
if !r.valid {
info!( info!(
"{:?} should have file extension {}", "{:?} should have file extension {}",
r.file, r.file,
r.recommended_extension().unwrap_or_else(|| "???".into()) r.recommended_extension().unwrap()
) )
} else {
trace!("{:?} is totally fine", r.file)
}
} }
Err(f) => warn!("{:#?}: Error 0uo - {}", f.1, f.0), Err(f) => warn!("{:#?}: Error 0uo - {}", f.1, f.0),
} }
} }
if results.is_empty() {
info!("All files have valid extensions!")
}
match args.output_format { match args.output_format {
OutputFormat::Script => { OutputFormat::Script => {
let s = Script::new(); let s = Script::new();
if s.write_all(&results, &mut BufWriter::new(stdout().lock())).is_err() { s.write_all(&results, &mut BufWriter::new(stdout().lock()))
exit(exitcode::IOERR); .expect("failed to output");
}
} }
OutputFormat::Text => todo!(), OutputFormat::Text => todo!(),
} }

View file

@ -17,39 +17,25 @@ pub struct InferDb {
impl MimeDb for InferDb { impl MimeDb for InferDb {
fn init() -> Self { fn init() -> Self {
let mut info = infer::Infer::new(); let mut info = infer::Infer::new();
// add a random file type just to make sure adding works and such
// jpeg2000 support because why the stinch not
info.add("image/jpeg2000", ".jp2", |buf| { info.add("image/jpeg2000", ".jp2", |buf| {
buf.len() > 23 && buf[..23] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A\x6A\x70\x32\x20"[..] buf.len() > 23
}); && buf[0] == 0x00
&& buf[1] == 0x00
info.add("image/svg+xml", "svg", |buf| { && buf[2] == 0x00
// before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish && buf[3] == 0x0C
// by "SGML-ish", i mean starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign, && buf[4] == 0x6A
// and then there's some other stuff we don't care about right now && buf[5] == 0x50
&& buf[6] == 0x20
// so, here comes our fancy pants """""SGML-ish validator""""" && buf[7] == 0x20
for c in buf { && buf[8] == 0x0D
match c { && buf[9] == 0x0A
// whitespace (according to https://www.w3.org/TR/xml/#NT-S) && buf[10] == 0x87
b'\t' | b'\r' | b'\n' | b'\x20' => continue, && buf[11] == 0x0A
b'<' => break, && buf[20] == 0x6A
_ => return false, && buf[21] == 0x70
} && buf[22] == 0x32
} && buf[23] == 0x20
// finally, to check whether or not the file is an SVG:
// - split the buffer up into chunks separated by the less than sign
// - check to see if this chunk starts with any of these identifiers:
let identifiers: Vec<&[u8]> = vec![b"svg", b"SVG", b"!DOCTYPE svg", b"!DOCTYPE SVG"];
// - if it does, the nested `any` will short circuit and immediately return true, causing the parent `any` to do
// the same
// - and finally, if none of the chunks match, we'll return false
// TODO: this is kind of messy, i'd like to clean it up somehow :(
buf
.split(|c| *c == b'<')
.any(|buf| identifiers.iter().any(|id| buf.starts_with(id)))
}); });
// unmut // unmut

View file

@ -1,8 +1,8 @@
use std::path::PathBuf; use std::path::PathBuf;
use crate::extensionset::ExtensionSet;
use clap::Clap; use clap::Clap;
use smartstring::{LazyCompact, SmartString}; use smartstring::{LazyCompact, SmartString};
use crate::extensionset::ExtensionSet;
#[derive(Clap, PartialEq, Debug)] #[derive(Clap, PartialEq, Debug)]
pub enum OutputFormat { pub enum OutputFormat {
@ -13,13 +13,7 @@ pub enum OutputFormat {
#[derive(Clap, Debug)] #[derive(Clap, Debug)]
pub struct Parameters { pub struct Parameters {
/// Only examine files with these extensions (Comma-separated list) /// Only examine files with these extensions (Comma-separated list)
#[clap( #[clap(short, long, use_delimiter = true, require_delimiter = true, required_unless_present = "ext-set")]
short,
long,
use_delimiter = true,
require_delimiter = true,
required_unless_present = "ext-set"
)]
pub exts: Option<Vec<SmartString<LazyCompact>>>, pub exts: Option<Vec<SmartString<LazyCompact>>>,
/// write good docs 0uo /// write good docs 0uo

View file

@ -1,6 +1,5 @@
use std::fmt::{Display, Formatter, Result}; use std::fmt::{Display, Formatter, Result};
#[derive(Debug)]
pub enum ScanError { pub enum ScanError {
File, File,
Mime, Mime,