From 9c8be183d94fe3bbb3670fe0ceb480be6f1aec90 Mon Sep 17 00:00:00 2001 From: Lynne Date: Wed, 14 Apr 2021 17:27:44 +1000 Subject: [PATCH 1/3] new release! 0u0 also i increased BUF_SIZE to 8192 bytes --- CHANGELOG.md | 14 +++++++++----- Cargo.lock | 33 +++++++++++++++++++++++++++++---- Cargo.toml | 4 ++-- src/inspectors.rs | 9 +++------ 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fc25ac..dfaa5ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,15 +2,19 @@ Dates are given in YYYY-MM-DD format. ## v0.2 -### v0.2.12 (2021-???) -- Much better README.md +### v0.2.12 (2021-04-14) +#### Features +- Added Text extension set - Better documentation for command line arguments +#### Bugfixes +- Fixed a very minor output bug relating to scanning symlinked directories +- Better detection for pre-OOXML Office files +#### Other +- Much better README.md - Added more stuff to test.py - PKGBUILD for Arch-based distros -- Added Text extension set - More test coverage -- Fixed a very minor output bug relating to scanning symlinked directories -- Better detection for a specific formats (pre-OOXML Office, EXE, DLL) +- Doubled BUF_SIZE ### v0.2.11 (2021-04-04) #### Features diff --git a/Cargo.lock b/Cargo.lock index 3ef6050..1b7e749 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,6 +29,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "cached" version = "0.23.0" @@ -39,6 +45,16 @@ dependencies = [ "once_cell", ] +[[package]] +name = "cfb" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca453e8624711b2f0f4eb47076a318feda166252a827ee25d067b43de83dcba0" 
+dependencies = [ + "byteorder", + "uuid", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -177,7 +193,7 @@ dependencies = [ [[package]] name = "fif" -version = "0.2.11" +version = "0.2.12" dependencies = [ "cached", "cfg-if", @@ -250,9 +266,12 @@ dependencies = [ [[package]] name = "infer" -version = "0.3.7" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865e8a58ae8e24d2c4412c31344afa1d302a3740ad67528c10f50d6876cdcf55" +checksum = "f92b41dab759f9e8427c03f519c344a14655490b8db548dac1e57a75b3258391" +dependencies = [ + "cfb", +] [[package]] name = "instant" @@ -321,7 +340,7 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "mime_guess" version = "2.0.4" -source = "git+https://github.com/Lynnesbian/mime_guess#55fe99663a1b78ad5f50ffe1a9aaeb65fb2cb4ca" +source = "git+https://github.com/Lynnesbian/mime_guess#28633a9936a9c3eb29cf85e99899e35ddd63d9c1" dependencies = [ "mime", "unicase", @@ -654,6 +673,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" + [[package]] name = "vec_map" version = "0.8.2" diff --git a/Cargo.toml b/Cargo.toml index 952a92e..dd3a63c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "fif" description = "A command-line tool for detecting and optionally correcting files with incorrect extensions." 
-version = "0.2.11" +version = "0.2.12" authors = ["Lynnesbian "] edition = "2018" license = "GPL-3.0-or-later" @@ -29,7 +29,7 @@ log = "0.4.14" mime_guess = "2.0.3" snailquote = "0.3.0" once_cell = "1.7.2" -infer = "0.3.6" +infer = "0.4.0" rayon = { version = "1.5.0", optional = true } exitcode = "1.1.2" cfg-if = "1.0.0" diff --git a/src/inspectors.rs b/src/inspectors.rs index 7925569..50b78e7 100644 --- a/src/inspectors.rs +++ b/src/inspectors.rs @@ -20,16 +20,14 @@ use crate::string_type::String; pub const INITIAL_BUF_SIZE: usize = 128; /// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes. -pub const BUF_SIZE: usize = 4096; +pub const BUF_SIZE: usize = 8192; /// Tries to identify the mimetype of a file from a given path. pub fn mime_type(db: &T, path: &Path) -> io::Result> { let mut buffer = [0; INITIAL_BUF_SIZE]; let mut file = File::open(path)?; - // this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the - // first few bytes for the purpose of mime sniffing - #[allow(clippy::unused_io_amount)] + // read a small amount to start with file.read(&mut buffer)?; let r = db.get_type(&buffer).filter(|mime| @@ -43,8 +41,7 @@ pub fn mime_type(db: &T, path: &Path) -> io::Result> { // doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to // shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further // will allow it to be detected correctly as the appropriate filetype. 
- && mime != &Mime::from_str("application/x-ole-storage").unwrap() - ); + && mime != &Mime::from_str("application/x-ole-storage").unwrap()); if r.is_some() { return Ok(r); From e7637013ebb71ad491e36be729801db1b36750cc Mon Sep 17 00:00:00 2001 From: Lynne Date: Wed, 14 Apr 2021 18:25:46 +1000 Subject: [PATCH 2/3] documented and cleaned up tests, renamed "Script" to "Shell", etc --- src/extension_set.rs | 1 + src/formats.rs | 4 +-- src/main.rs | 4 +-- src/tests/mod.rs | 68 +++++++++++++++++++++++++++----------------- 4 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/extension_set.rs b/src/extension_set.rs index 377b914..536b504 100644 --- a/src/extension_set.rs +++ b/src/extension_set.rs @@ -35,6 +35,7 @@ impl ExtensionSet { .concat(), Self::Documents => vec![ "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps", + "pages", "key", "numbers", ], Self::Text => mime_guess::get_mime_extensions_str("text/*").unwrap().to_vec(), // many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used diff --git a/src/formats.rs b/src/formats.rs index 0bd8b69..f08120e 100644 --- a/src/formats.rs +++ b/src/formats.rs @@ -110,9 +110,9 @@ pub trait Format { } /// Bourne-Shell compatible script. 
-pub struct Script {} +pub struct Shell {} -impl Format for Script { +impl Format for Shell { fn new() -> Self { Self {} } diff --git a/src/main.rs b/src/main.rs index a614f89..34c27a7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -29,7 +29,7 @@ use rayon::prelude::*; use walkdir::{DirEntry, WalkDir}; use crate::findings::Findings; -use crate::formats::{Format, PowerShell, Script}; +use crate::formats::{Format, PowerShell, Shell}; use crate::mime_db::MimeDb; use crate::parameters::{OutputFormat, ScanOpts}; use crate::scan_error::ScanError; @@ -133,7 +133,7 @@ fn main() { let mut buffered_stdout = BufWriter::new(stdout()); let result = match args.output_format { - OutputFormat::Sh => Script::new().write_all(&results, &mut buffered_stdout), + OutputFormat::Sh => Shell::new().write_all(&results, &mut buffered_stdout), OutputFormat::PowerShell | OutputFormat::Powershell => PowerShell::new().write_all(&results, &mut buffered_stdout), OutputFormat::Text => todo!(), }; diff --git a/src/tests/mod.rs b/src/tests/mod.rs index a67a7fd..f1b4ce6 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,19 +1,15 @@ +use crate::findings::Findings; +use crate::formats::{Format, Shell}; use crate::inspectors::{mime_extension_lookup, BUF_SIZE}; -use crate::{extension_from_path, init_db, scan_directory, scan_from_walkdir}; - use crate::mime_db::MimeDb; -use crate::parameters::{Parameters, ScanOpts}; use crate::string_type::String; -use cfg_if::cfg_if; +use crate::{extension_from_path, scan_directory, scan_from_walkdir}; + use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG}; use mime_guess::Mime; -use crate::findings::Findings; -use crate::formats::{Format, Script}; -use std::borrow::Borrow; use std::collections::HashMap; use std::ffi::OsStr; -use std::io::Read; use std::path::Path; const JPEG_BYTES: &[u8] = b"\xFF\xD8\xFF"; @@ -21,7 +17,7 @@ const PNG_BYTES: &[u8] = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"; const PDF_BYTES: &[u8] = b"%PDF-"; const 
ZIP_BYTES: &[u8] = b"PK\x03\x04"; -cfg_if! { +cfg_if::cfg_if! { if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] { fn get_mime_db() -> crate::mime_db::InferDb { crate::mime_db::InferDb::init() @@ -39,6 +35,7 @@ fn application_zip() -> Mime { } #[test] +/// Ensure that `extension_from_path` successfully returns the extension from a set of paths. fn get_ext() { let mut ext_checks: HashMap<_, Option<&OsStr>> = HashMap::new(); ext_checks.insert(Path::new("test.txt"), Some(OsStr::new("txt"))); @@ -54,6 +51,7 @@ fn get_ext() { } #[test] +/// Ensure that the mime types for JPEG, PNG, PDF, and ZIP are detected from their magic numbers. fn detect_type() { let db = get_mime_db(); assert_eq!(db.get_type(JPEG_BYTES), Some(IMAGE_JPEG)); @@ -63,6 +61,8 @@ fn detect_type() { } #[test] +/// Ensure that `mime_extension_lookup` works as expected, and that the set of extensions for JPEG, PNG, PDF, and ZIP +/// contain "jpg", "png", "pdf", and "zip", respectively. fn recommend_ext() { assert!(mime_extension_lookup(IMAGE_JPEG) .unwrap() @@ -77,11 +77,17 @@ fn recommend_ext() { } #[test] +/// Create a simple directory with some files, run `scan_directory` on it, and ensure that the files have their +/// associated mime types correctly deduced. fn simple_directory() { + use crate::parameters::ScanOpts; + use std::borrow::Borrow; use std::fs::File; use std::io::Write; use tempfile::tempdir; + // set of files to scan. all but the last files have magic numbers corresponding to their extension, except for + // "wrong.jpg", which is actually a png. 
let mut files = HashMap::new(); files.insert("test.jpg", JPEG_BYTES); files.insert("test.jpeg", JPEG_BYTES); @@ -107,41 +113,39 @@ fn simple_directory() { follow_symlinks: false, }; - let entries = scan_directory( - &dir.path().to_path_buf(), - Some(&vec!["jpg", "jpeg", "png", "pdf", "zip"]), - &scan_opts, - ) - .expect("Directory scan failed."); + let entries = scan_directory(&dir.path().to_path_buf(), None, &scan_opts).expect("Directory scan failed."); assert_eq!(entries.len(), files.len()); - // initialise global mime DB - init_db(); + // initialise global mime DB - this is needed because `scan_from_walkdir` expects it to be present. + crate::init_db(); let results = scan_from_walkdir(&entries); for result in results { + // there should be no IO errors during this test. any IO errors encountered are outside the scope of this test. let result = result.expect("Error while scanning file"); if !result.valid { - // this should be "wrong.jpg", which is a misnamed png file - // 1. ensure extension is "png" + // the only invalid file detected should be "wrong.jpg", which is a misnamed png file + // 1. ensure detected extension is "jpg" assert_eq!(extension_from_path(&*result.file).unwrap(), OsStr::new("jpg")); - // 2. ensure mime type detected is IMAGE_PNG + // 2. ensure detected mime type is IMAGE_PNG assert_eq!(result.mime, IMAGE_PNG); - // 3. ensure recommended extension is in the list of known extensions for PNG files - assert!(mime_extension_lookup(IMAGE_PNG) - .unwrap() - .contains(&result.recommended_extension().unwrap())); + // 3. 
ensure the recommended extension for "wrong.jpg" is "png" + assert_eq!(&result.recommended_extension().unwrap(), &String::from("png")); continue; } - // check if the recommended extension for this file is in the list of known extensions for its mimetype + // check if the recommended extension for this file is in the list of known extensions for its mimetype - for + // example, if the file is determined to be an IMAGE_PNG, its recommended extension should be one of the extensions + // returned by `mime_extension_lookup(IMAGE_PNG)`. assert!(mime_extension_lookup(result.mime.clone()) .unwrap() .contains(&result.recommended_extension().unwrap())); // make sure the guessed mimetype is correct based on the extension of the scanned file + // because we already know that the extensions match the mimetype (as we created these files ourselves earlier in + // the test), all files with the "jpg" extension should be IMAGE_JPEGs, etc. let ext = extension_from_path(result.file); assert!(ext.is_some()); assert_eq!( @@ -161,7 +165,10 @@ fn simple_directory() { } #[test] +/// Ensure that command line argument parsing works correctly - flags are interpreted, booleans are set, and so on. fn argument_parsing() { + use crate::parameters::{Parameters, ScanOpts}; + use clap::Clap; // pass `-f`, which enables following symlinks, and `-E images`, which scans files with image extensions @@ -192,7 +199,10 @@ fn argument_parsing() { } #[test] +/// Ensure that badly formed command line arguments are rejected. fn rejects_bad_args() { + use crate::parameters::Parameters; + use clap::Clap; let tests = [ // Non-existent flags: @@ -213,6 +223,8 @@ fn rejects_bad_args() { } #[test] +/// Generate random series of bytes and try to identify them. This test makes no assertions and can only fail if the +/// mime database somehow panics or hangs. 
fn identify_random_bytes() { let db = get_mime_db(); let rng = fastrand::Rng::new(); @@ -233,7 +245,11 @@ fn identify_random_bytes() { } #[test] +/// Ensure that, for a given file "wrong.bad", which should have extension "good", the shell output contains something +/// like "mv wrong.bad wrong.good". fn outputs_move_commands() { + use std::io::Read; + // create an example finding stating that "misnamed_file.png" has been identified as a jpeg file let entries = vec![Ok(Findings { file: Path::new("misnamed_file.png"), @@ -244,7 +260,7 @@ fn outputs_move_commands() { let mut cursor = std::io::Cursor::new(Vec::new()); let mut contents = std::string::String::new(); - Script::new() + Shell::new() .write_all(&entries, &mut cursor) .expect("Failed to write to cursor"); cursor.set_position(0); From 7f87f59670a29364752f02b6bdcba47be9479ed0 Mon Sep 17 00:00:00 2001 From: Lynne Date: Wed, 14 Apr 2021 18:43:39 +1000 Subject: [PATCH 3/3] updated changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfaa5ea..da7ef9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ Dates are given in YYYY-MM-DD format. ## v0.2 +### v0.2.12 (2021-???) +- Added Apple iWork document formats to documents extension set +- Cleaned up and properly documented tests +- Renamed `Script` (in `formats.rs`) to `Shell`, in line with renaming in `parameters.rs` + ### v0.2.12 (2021-04-14) #### Features - Added Text extension set