diff --git a/.gitignore b/.gitignore index 1fe5795..7744931 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ fif_* *.sh !clippy.sh cargo-timing*.html +todo.txt diff --git a/Cargo.lock b/Cargo.lock index f3c0d3d..8806fb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -176,7 +176,7 @@ checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193" [[package]] name = "fif" -version = "0.2.3+hotfix" +version = "0.2.4" dependencies = [ "cached", "clap", diff --git a/Cargo.toml b/Cargo.toml index 398af60..2c334a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "fif" description = "A command-line tool for detecting and optionally correcting files with incorrect extensions." -version = "0.2.3+hotfix" +version = "0.2.4" authors = ["Lynnesbian "] edition = "2018" license = "GPL-3.0-or-later" diff --git a/src/extensionset.rs b/src/extensionset.rs index 01e2db4..47667fe 100644 --- a/src/extensionset.rs +++ b/src/extensionset.rs @@ -16,13 +16,17 @@ impl ExtensionSet { Self::Images => mime_guess::get_mime_extensions_str("image/*"), Self::Videos => mime_guess::get_mime_extensions_str("video/*"), Self::Audio => mime_guess::get_mime_extensions_str("audio/*"), - Self::Documents => Some(&[ - "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "pdf", "odt", "ods", "odp", - ][..]), + Self::Documents => Some( + &[ + "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps", + ][..], + ), // many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used // somehow to extract extensions for compressed files from mime_guess? Self::Archives => Some(&["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2"][..]), _ => todo!(), - }.unwrap().to_vec() + } + .unwrap() + .to_vec() } } diff --git a/src/formats.rs b/src/formats.rs index c66074d..8889db1 100644 --- a/src/formats.rs +++ b/src/formats.rs @@ -37,7 +37,7 @@ fn smart_write(f: &mut W, writeables: &[Writable]) -> io::Result<()> { for writeable in writeables { match writeable { Writable::Space => write!(f, " ")?, - Writable::Newline => writeln!(f, )?, + Writable::Newline => writeln!(f,)?, Writable::String(s) => write!(f, "{}", s)?, Writable::Path(path) => { if let Some(string) = path.to_str() { @@ -45,10 +45,10 @@ fn smart_write(f: &mut W, writeables: &[Writable]) -> io::Result<()> { } else { write!(f, "'''")?; #[cfg(unix)] - f.write_all(&*path.as_os_str().as_bytes())?; + f.write_all(&*path.as_os_str().as_bytes())?; #[cfg(windows)] write!(f, "{}", path.as_os_str().to_string_lossy())?; // TODO: implement bonked strings for windows - // f.write_all(&*path.as_os_str().encode_wide().collect::>())?; + // f.write_all(&*path.as_os_str().encode_wide().collect::>())?; write!(f, "'''")? } } @@ -105,37 +105,38 @@ impl Format for Script { } fn rename(&self, f: &mut W, from: &PathBuf, to: &PathBuf) -> io::Result<()> { - smart_write(f, &[ - "mv -v -i -- ".into(), - from.into(), - Writable::Space, - to.into(), - Writable::Newline - ]) + smart_write( + f, + &[ + "mv -v -i -- ".into(), + from.into(), + Writable::Space, + to.into(), + Writable::Newline, + ], + ) } fn no_known_extension(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { - smart_write(f, &[ - "echo No known extension for ".into(), - path.into(), - Writable::Newline - ]) + smart_write( + f, + &["echo No known extension for ".into(), path.into(), Writable::Newline], + ) } fn unreadable(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { - smart_write(f, &[ - "# Failed to read ".into(), - path.into(), - Writable::Newline - ]) + smart_write(f, &["# Failed to read ".into(), path.into(), Writable::Newline]) } fn unknown_type(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { - smart_write(f, &[ - "# Failed to detect mime type for ".into(), - path.into(), - Writable::Newline - ]) + smart_write( + f, + &[ + "# Failed to detect mime type for ".into(), + path.into(), + Writable::Newline, + ], + ) } fn header(&self, _: &Entries, f: &mut W) -> io::Result<()> { diff --git a/src/inspectors.rs b/src/inspectors.rs index 98a93ce..b741e43 100644 --- a/src/inspectors.rs +++ b/src/inspectors.rs @@ -12,22 +12,12 @@ use crate::mimedb::MimeDb; // use log::{debug, warn}; -// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest -// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131 -// bytes. as only two formats need more than 128 bytes, it would be fairly reasonable to only read 128 bytes. -// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix -// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway, -// so maybe it's fine...? maybe this should be configurable by the user? i don't know. -// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that xdg-mime requires -// at least 265 bytes to identify a tar file. +// rather than reading once into a large buffer, it tends to be faster to first try identifying the file from a small +// chunk read from the top, and *then* proceeding with the large buffer. many file formats can be easily identified by +// the first 128 bytes. of course, not all formats can, and some (OOXML...) require reading a long ways in. -// additionally, since many formats can by identified with ≤64 bytes, it's worth reading 64 bytes, checking for the mime -// type, and then reading the full 512 bytes if necessary. in most cases, this will end up being faster on the whole, -// even though two reads are needed for certain formats, unless the directory being scanned is predominantly made up of -// such formats. - -const INITIAL_BUF_SIZE: usize = 64; -const BUF_SIZE: usize = 512; +const INITIAL_BUF_SIZE: usize = 128; +const BUF_SIZE: usize = 4096; pub fn mime_type(db: &T, path: &Path) -> io::Result> { let mut buffer = [0; INITIAL_BUF_SIZE]; @@ -38,19 +28,27 @@ pub fn mime_type(db: &T, path: &Path) -> io::Result> { #[allow(clippy::unused_io_amount)] file.read(&mut buffer)?; - let r = db.get_type(&buffer); + let r = db.get_type(&buffer).filter(|mime| + // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already + // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG) + mime != &mime_guess::mime::TEXT_XML + // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures. + // determining that a file is in one of the MS office formats in particular requires looking quite far into the + // file. + && mime != &Mime::from_str("application/zip").unwrap()); if r.is_some() { return Ok(r); } // attempt to read up to the BUF_SIZE bytes of the file. - // we've already read the first 64 bytes into a buffer, but i can't see an obvious way to reuse those 64 bytes that's - // faster than simply moving the seek position back to the start of the file and re-reading the whole 512 bytes. - // for example, starting with a buffer of 64 bytes, then creating a new 512 byte buffer from the contents of the first - // buffer with (512 - 64) blank bytes, then finally reading the rest, is much slower than simply reading the file - // twice. i don't at all doubt that there IS a way to do this efficiently, and i can think of a way in principle, but - // i'm not sure how to express it in a way that is both idiomatic/safe and fast. + // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's + // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes. + // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer + // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator, + // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this + // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both + // idiomatic/safe and fast. let mut buffer = [0; BUF_SIZE]; file.seek(SeekFrom::Start(0))?; file.read(&mut buffer)?; @@ -86,19 +84,9 @@ cached! { // to have valid extensions. [vec![String::from("xml"), String::from("svg")], possible_exts].concat() - } else if mime == Mime::from_str("application/zip").unwrap() { - // until proper document support is added, treat all common document extensions as zips - [vec![ - String::from("zip"), - String::from("docx"), - String::from("pptx"), - String::from("xlsx"), - String::from("odt"), - String::from("ods"), - String::from("odp"), - String::from("pages"), - String::from("key"), - String::from("numbers")], possible_exts].concat() + } else if mime == Mime::from_str("application/msword").unwrap() { + // classic office files considered harmful + vec![String::from("doc"), String::from("xls"), String::from("ppt")] } else { possible_exts }) diff --git a/src/main.rs b/src/main.rs index 657800d..4a0b369 100644 --- a/src/main.rs +++ b/src/main.rs @@ -253,7 +253,8 @@ fn main() { } if results.is_empty() { - info!("All files have valid extensions!") + info!("All files have valid extensions!"); + exit(0); } match args.output_format { diff --git a/src/mimedb.rs b/src/mimedb.rs index 545b7ce..f7f36ba 100644 --- a/src/mimedb.rs +++ b/src/mimedb.rs @@ -13,6 +13,13 @@ pub struct InferDb { db: infer::Infer, } +fn open_document_check(buf: &[u8], kind: &str) -> bool { + let mime = format!("application/vnd.oasis.opendocument.{}", kind); + let mime = mime.as_bytes(); + + buf.len() > 38 + mime.len() && buf.starts_with(b"PK\x03\x04") && buf[38..mime.len() + 38] == mime[..] +} + #[cfg(feature = "infer-backend")] impl MimeDb for InferDb { fn init() -> Self { @@ -23,6 +30,18 @@ impl MimeDb for InferDb { buf.len() > 23 && buf[..23] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A\x6A\x70\x32\x20"[..] }); + info.add("application/vnd.oasis.opendocument.text", "odt", |buf| { + open_document_check(buf, "text") + }); + + info.add("application/vnd.oasis.opendocument.spreadsheet", "ods", |buf| { + open_document_check(buf, "spreadsheet") + }); + + info.add("application/vnd.oasis.opendocument.presentation", "odp", |buf| { + open_document_check(buf, "presentation") + }); + info.add("image/svg+xml", "svg", |buf| { // before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish // by "SGML-ish", i mean starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign, diff --git a/src/parameters.rs b/src/parameters.rs index 7f29d26..481419f 100644 --- a/src/parameters.rs +++ b/src/parameters.rs @@ -11,6 +11,7 @@ pub enum OutputFormat { } #[derive(Clap, Debug)] +#[clap(version = option_env!("CARGO_PKG_VERSION").unwrap_or("???"))] pub struct Parameters { /// Only examine files with these extensions (Comma-separated list) #[clap(