diff --git a/src/files.rs b/src/files.rs index 3f484f6..ad93c65 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1,11 +1,20 @@ use crate::findings::{Findings, ScanError}; +use crate::mime_db::MimeDb; use crate::parameters::ScanOpts; -use crate::{inspectors, MIMEDB}; +use crate::MIMEDB; + +use std::collections::BTreeSet; +use std::fs::File; +use std::io; +use std::io::{Read, Seek, SeekFrom}; +use std::path::Path; +use std::str::FromStr; + +use cached::cached; use cfg_if::cfg_if; use log::{debug, error}; +use mime::Mime; use mime_guess::from_ext; -use std::collections::BTreeSet; -use std::path::Path; use walkdir::{DirEntry, WalkDir}; cfg_if! { @@ -84,7 +93,7 @@ pub fn wanted_file( pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result { let path = entry.path(); // try to determine mimetype for this entry - let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) { + let result = match mime_type(MIMEDB.get().unwrap(), path) { // an error occurred while trying to read the file Err(_) => return Err(ScanError::File(path)), // the file was read successfully, but we were unable to determine its mimetype @@ -94,7 +103,7 @@ pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result(db: &T, path: &Path) -> io::Result> { + let mut buffer = [0; INITIAL_BUF_SIZE]; + let mut file = File::open(path)?; + + // read a small amount to start with + file.read(&mut buffer)?; + + let r = db.get_type(&buffer).filter(|mime| + // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already + // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG) + mime != &mime::TEXT_XML + // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures. + // determining that a file is in one of the MS office formats in particular requires looking quite far into the + // file. + && mime != &Mime::from_str("application/zip").unwrap() + // doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to + // shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further + // will allow it to be detected correctly as the appropriate filetype. + && mime != &Mime::from_str("application/x-ole-storage").unwrap()); + + if r.is_some() { + return Ok(r); + } + + // attempt to read up to the BUF_SIZE bytes of the file. + // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's + // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes. + // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer + // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator, + // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this + // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both + // idiomatic/safe and fast. + let mut buffer = [0; BUF_SIZE]; + file.seek(SeekFrom::Start(0))?; + file.read(&mut buffer)?; + Ok(db.get_type(&buffer)) +} + +cached! { + MIMEXT; + fn mime_extension_lookup(essence: String) -> Option> = { + // Returns a list of known extensions for this mime type, if any. + // This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores + // the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the + // essence_str (which includes the suffix) fixes this. + // ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the + // proc_macro version of cached, but it has a huge number of deps :c + + let essence = essence.as_str(); + let mut exts = mime_guess::get_mime_extensions_str(essence); + if exts.is_none() { + // no matches :c + // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing" + // but mime_guess only understands "some/thing", or vice-versa. + // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with + // "some/x-thing". + if essence.contains("/x-") { + // replace e.g. "application/x-gzip" with "application/gzip" + exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/")); + } else { + // replace e.g. "video/mp2t" with "video/x-mp2t" + exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-")); + } + } + + match exts { + Some(exts) => { + let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); + + Some(if essence == mime::IMAGE_JPEG.essence_str() { + // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are + // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can + // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. + [vec![String::from("jpg")], possible_exts].concat() + + } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" { + // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should + // (in my opinion) be "xml". + // there's also another problem: SVG files can easily be misidentified as XML files, because they usually + // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read + // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg" + // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered + // to have valid extensions. + // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to + // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its + // extension is classed as application/*+xml, consider it OK + [vec![String::from("xml"), String::from("svg")], possible_exts].concat() + + } else if essence == "application/msword" { + // classic office files considered harmful + vec![String::from("doc"), String::from("xls"), String::from("ppt")] + + } else if essence == "application/zip" { + // neither xdg-mime nor infer seem to be able to detect office XML files properly... + [vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat() + + } else if essence == "application/x-ms-dos-executable" { + // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the + // other! + [vec![String::from("dll"), String::from("exe")], possible_exts].concat() + } else { + possible_exts + }) + }, + None => None + } + } +} diff --git a/src/findings.rs b/src/findings.rs index 01c3223..94b48a0 100644 --- a/src/findings.rs +++ b/src/findings.rs @@ -2,7 +2,7 @@ use std::path::{Path, PathBuf}; use mime::Mime; -use crate::inspectors::mime_extension_lookup; +use crate::files::mime_extension_lookup; use crate::String; #[cfg(feature = "json")] @@ -37,7 +37,7 @@ impl serde::Serialize for Findings { } impl Findings { - pub fn recommended_extension(&self) -> Option { + pub fn recommended_extension(&self) -> Option { mime_extension_lookup(self.mime.essence_str().into()).map(|extensions| extensions[0].clone()) } } diff --git a/src/inspectors.rs b/src/inspectors.rs deleted file mode 100644 index b39db5b..0000000 --- a/src/inspectors.rs +++ /dev/null @@ -1,133 +0,0 @@ -//! Functions for getting the mime type and extension of a file. - -use std::fs::File; -use std::io; -use std::io::{Read, Seek, SeekFrom}; -use std::path::Path; -use std::str::FromStr; - -use cached::cached; -use mime::Mime; - -use crate::String; -use crate::MimeDb; - -/// The number of bytes to read initially. -/// -/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small -/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats -/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases. -pub const INITIAL_BUF_SIZE: usize = 128; - -/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes. -pub const BUF_SIZE: usize = 8192; - -/// Tries to identify the mimetype of a file from a given path. -pub fn mime_type(db: &T, path: &Path) -> io::Result> { - let mut buffer = [0; INITIAL_BUF_SIZE]; - let mut file = File::open(path)?; - - // read a small amount to start with - file.read(&mut buffer)?; - - let r = db.get_type(&buffer).filter(|mime| - // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already - // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG) - mime != &mime::TEXT_XML - // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures. - // determining that a file is in one of the MS office formats in particular requires looking quite far into the - // file. - && mime != &Mime::from_str("application/zip").unwrap() - // doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to - // shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further - // will allow it to be detected correctly as the appropriate filetype. - && mime != &Mime::from_str("application/x-ole-storage").unwrap()); - - if r.is_some() { - return Ok(r); - } - - // attempt to read up to the BUF_SIZE bytes of the file. - // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's - // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes. - // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer - // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator, - // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this - // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both - // idiomatic/safe and fast. - let mut buffer = [0; BUF_SIZE]; - file.seek(SeekFrom::Start(0))?; - file.read(&mut buffer)?; - Ok(db.get_type(&buffer)) -} - -cached! { - MIMEXT; - fn mime_extension_lookup(essence: String) -> Option> = { - // Returns a list of known extensions for this mime type, if any. - // This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores - // the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the - // essence_str (which includes the suffix) fixes this. - // ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the - // proc_macro version of cached, but it has a huge number of deps :c - - let essence = essence.as_str(); - let mut exts = mime_guess::get_mime_extensions_str(essence); - if exts.is_none() { - // no matches :c - // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing" - // but mime_guess only understands "some/thing", or vice-versa. - // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with - // "some/x-thing". - if essence.contains("/x-") { - // replace e.g. "application/x-gzip" with "application/gzip" - exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/")); - } else { - // replace e.g. "video/mp2t" with "video/x-mp2t" - exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-")); - } - } - - match exts { - Some(exts) => { - let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); - - Some(if essence == mime::IMAGE_JPEG.essence_str() { - // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are - // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can - // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. - [vec![String::from("jpg")], possible_exts].concat() - - } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" { - // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should - // (in my opinion) be "xml". - // there's also another problem: SVG files can easily be misidentified as XML files, because they usually - // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read - // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg" - // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered - // to have valid extensions. - // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to - // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its - // extension is classed as application/*+xml, consider it OK - [vec![String::from("xml"), String::from("svg")], possible_exts].concat() - - } else if essence == "application/msword" { - // classic office files considered harmful - vec![String::from("doc"), String::from("xls"), String::from("ppt")] - - } else if essence == "application/zip" { - // neither xdg-mime nor infer seem to be able to detect office XML files properly... - [vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat() - - } else if essence == "application/x-ms-dos-executable" { - // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the - // other! - [vec![String::from("dll"), String::from("exe")], possible_exts].concat() - } else { - possible_exts - }) - }, - None => None - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 32436e2..41449fe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,6 @@ pub mod files; pub mod findings; pub mod formats; -pub mod inspectors; pub mod mime_db; pub mod parameters; pub mod utils; diff --git a/src/parameters.rs b/src/parameters.rs index f567a93..918ca9b 100644 --- a/src/parameters.rs +++ b/src/parameters.rs @@ -1,7 +1,7 @@ //! [Clap] struct used to parse command line arguments. -use crate::String as StringType; use crate::utils::{clap_long_version, clap_version}; +use crate::String as StringType; use cfg_if::cfg_if; use clap::{AppSettings, Clap}; use std::collections::BTreeSet; diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 8b32f6c..04c4fd8 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,7 +1,7 @@ +use fif::files::{mime_extension_lookup, BUF_SIZE}; use fif::files::{scan_directory, scan_from_walkdir}; use fif::findings::Findings; use fif::formats::{Format, PowerShell, Shell}; -use fif::inspectors::{mime_extension_lookup, BUF_SIZE}; use fif::mime_db::MimeDb; use fif::String; @@ -66,18 +66,19 @@ fn detect_type() { /// Ensure that `mime_extension_lookup` works as expected, and that the set of extensions for JPEG, PNG, PDF, and ZIP /// contain "jpg", "png", "pdf", and "zip", respectively. fn recommend_ext() { + use std::string::String as StdString; assert!(mime_extension_lookup(IMAGE_JPEG.essence_str().into()) .unwrap() - .contains(&String::from("jpg"))); + .contains(&StdString::from("jpg"))); assert!(mime_extension_lookup(IMAGE_PNG.essence_str().into()) .unwrap() - .contains(&String::from("png"))); + .contains(&StdString::from("png"))); assert!(mime_extension_lookup(APPLICATION_PDF.essence_str().into()) .unwrap() - .contains(&String::from("pdf"))); + .contains(&StdString::from("pdf"))); assert!(mime_extension_lookup(application_zip().essence_str().into()) .unwrap() - .contains(&String::from("zip"))); + .contains(&StdString::from("zip"))); } #[test]