diff --git a/Cargo.lock b/Cargo.lock index 880b12a..54afde4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -349,9 +349,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "new_mime_guess" -version = "2.0.4" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9eafbe0ec4560a250f49cda59952d7a377b364918d7d8163d71790158c986e54" +checksum = "e714f72c691c7d2b344ec8dd57d7f52b59651f46b9de477fb68363f097d694ae" dependencies = [ "mime", "unicase", diff --git a/Cargo.toml b/Cargo.toml index 9a54ade..47c9a7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ xdg-mime-backend = [] [dependencies] walkdir = "2.3.2" log = "0.4.14" -mime_guess = { package = "new_mime_guess", version = "2.0.4" } +mime_guess = { package = "new_mime_guess", version = "2.1.0" } snailquote = "0.3.0" once_cell = "1.7.2" infer = "0.4.0" diff --git a/src/inspectors.rs b/src/inspectors.rs index b172728..35577ee 100644 --- a/src/inspectors.rs +++ b/src/inspectors.rs @@ -101,14 +101,17 @@ cached! { // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. [vec![String::from("jpg")], possible_exts].concat() - } else if mime == mime_guess::mime::TEXT_XML { - // a somewhat similar case arises with XML files - the first suggested extension is "addin", when it should + } else if mime == mime_guess::mime::TEXT_XML || mime == Mime::from_str("application/xml").unwrap() { + // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should // (in my opinion) be "xml". // there's also another problem: SVG files can easily be misidentified as XML files, because they usually // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read // before it's possible to determine that it's an SVG rather than an XML file. 
to "fix" this, we can add "svg" // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered // to have valid extensions. + // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to + // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its + // extension is classed as application/*+xml, consider it OK [vec![String::from("xml"), String::from("svg")], possible_exts].concat() } else if mime == Mime::from_str("application/msword").unwrap() { diff --git a/src/main.rs b/src/main.rs index 0fd2bcb..1181b6b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,6 +34,7 @@ use crate::formats::{Format, PowerShell, Shell}; use crate::mime_db::MimeDb; use crate::parameters::{OutputFormat, ScanOpts}; use crate::scan_error::ScanError; +use std::collections::BTreeSet; mod findings; mod formats; @@ -143,7 +144,7 @@ fn main() { let result = match args.output_format { OutputFormat::Sh => Shell::new().write_all(&results, &mut buffered_stdout), - OutputFormat::PowerShell | OutputFormat::Powershell => PowerShell::new().write_all(&results, &mut buffered_stdout), + OutputFormat::PowerShell => PowerShell::new().write_all(&results, &mut buffered_stdout), OutputFormat::Text => todo!(), }; @@ -184,7 +185,12 @@ cfg_if! { /// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in /// `exts` (if specified), potentially skipping over hidden files, and so on. -fn wanted_file(entry: &DirEntry, exts: Option<&Vec<&str>>, exclude: Option<&Vec<&str>>, scan_opts: &ScanOpts) -> bool { +fn wanted_file( + entry: &DirEntry, + exts: Option<&BTreeSet<&str>>, + exclude: Option<&BTreeSet<&str>>, + scan_opts: &ScanOpts, +) -> bool { if entry.depth() == 0 { // the root directory should always be scanned. return true; @@ -292,8 +298,8 @@ fn scan_from_walkdir(entries: &[DirEntry]) -> Vec> { /// [DirEntry]s. 
fn scan_directory( dirs: &Path, - exts: Option<&Vec<&str>>, - exclude: Option<&Vec<&str>>, + exts: Option<&BTreeSet<&str>>, + exclude: Option<&BTreeSet<&str>>, scan_opts: &ScanOpts, ) -> Option> { let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter(); diff --git a/src/parameters.rs b/src/parameters.rs index 45ef9c7..766117d 100644 --- a/src/parameters.rs +++ b/src/parameters.rs @@ -3,6 +3,7 @@ use crate::string_type::String as StringType; use cfg_if::cfg_if; use clap::{AppSettings, Clap}; +use std::collections::BTreeSet; use std::path::PathBuf; cfg_if! { @@ -18,9 +19,8 @@ pub enum OutputFormat { /// A Bourne shell compatible script. Sh, /// A PowerShell script. + #[clap(alias = "powershell")] PowerShell, - /// Also a PowerShell script, with different casing to allow for `fif -o powershell`. - Powershell, /// Plain text. Text, } @@ -44,7 +44,6 @@ pub struct Parameters { // `-e sil\,ly` is treated as ["sil", "ly"] rather than as ["silly"], no matter how i escape the comma (in bash, // anyway). is this really an issue? it does technically exclude some perfectly valid extensions, but i've never seen // a file extension with a comma in its name before. - /// Only examine files with these extensions. /// Multiple extensions can be specified by either using the flag multiple times (`-e jpg -e png -e gif`), or by /// separating them with commas (`-e jpg,png,gif`). @@ -113,7 +112,7 @@ pub struct ScanOpts { impl Parameters { /// Returns an optional vec of the extensions to be scanned - i.e., extensions specified via the `-e` or `-E` flag, /// minus the extensions excluded with the `-x` flag; i.e., the difference between the included and excluded sets. 
- pub fn extensions(&self) -> Option<Vec<&str>> { + pub fn extensions(&self) -> Option<BTreeSet<&str>> { if let Some(included) = self.included_extensions() { if let Some(excluded) = self.excluded_extensions() { // return included extensions without excluded extensions @@ -131,30 +130,34 @@ impl Parameters { /// Returns an optional vec of extensions that were specified by `-e` or `-E`. Note that this doesn't account for /// extensions excluded by the exclusion flags. - pub fn included_extensions(&self) -> Option<Vec<&str>> { - let mut included = vec![]; - if let Some(exts) = self.exts.as_ref() { // -e + pub fn included_extensions(&self) -> Option<BTreeSet<&str>> { + let mut included = BTreeSet::new(); + if let Some(exts) = self.exts.as_ref() { + // -e included.extend(exts.iter().map(|ext| ext.as_str())); } - if !&self.ext_set.is_empty() { // -E + if !&self.ext_set.is_empty() { + // -E included.extend(self.ext_set.iter().flat_map(|set| set.extensions())); } match included { x if x.is_empty() => None, - x => Some(x) + x => Some(x), } } /// Returns an optional vec of extensions that were specified by `-x` or `-X`. 
- pub fn excluded_extensions(&self) -> Option<Vec<&str>> { - let mut excluded = vec![]; - if let Some(exclude) = self.exclude.as_ref() { // -x + pub fn excluded_extensions(&self) -> Option<BTreeSet<&str>> { + let mut excluded = BTreeSet::new(); + if let Some(exclude) = self.exclude.as_ref() { + // -x excluded.extend(exclude.iter().map(|ext| ext.as_str())); } - if !&self.exclude_set.is_empty() { // -X + if !&self.exclude_set.is_empty() { + // -X excluded.extend(self.exclude_set.iter().flat_map(|set| set.extensions())); } @@ -162,7 +165,7 @@ impl Parameters { // tongue twister: enter X-options' excellent extension exclusion match excluded { x if x.is_empty() => None, - x => Some(x) + x => Some(x), } } @@ -226,13 +229,16 @@ impl ExtensionSet { "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps", "pages", "key", "numbers", ], - Self::Text => mime_guess::get_mime_extensions_str("text/*").unwrap().to_vec(), + Self::Text => [mime_guess::get_mime_extensions_str("text/*").unwrap(), &["js", "pl", "csh", "sh", "bash", "zsh", "fish", "bat", "php"]].concat(), // many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used // somehow to extract extensions for compressed files from mime_guess? 
- Self::Archives => vec!["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2", "tgz", "rpa"], + Self::Archives => vec![ + "zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2", "tgz", "rpa", "txz", "tz2", "sea", "sitx", "z", + "cpio", + ], Self::System => vec![ "com", "dll", "exe", "sys", "reg", "nt", "cpl", "msi", "efi", "bio", "rcv", "mbr", "sbf", "grub", "ko", - "dylib", "pdb", "hdmp", "crash", + "dylib", "pdb", "hdmp", "crash", "cab", ], } } diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 3fbd9f3..746165d 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -5,15 +5,15 @@ use crate::mime_db::MimeDb; use crate::string_type::String; use crate::{extension_from_path, scan_directory, scan_from_walkdir}; -use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG}; -use mime_guess::Mime; use crate::parameters::Parameters; use clap::Clap; +use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG}; +use mime_guess::Mime; +use crate::parameters::ExtensionSet; use std::collections::HashMap; use std::ffi::OsStr; use std::path::{Path, PathBuf}; -use crate::parameters::ExtensionSet; const JPEG_BYTES: &[u8] = b"\xFF\xD8\xFF"; const PNG_BYTES: &[u8] = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"; @@ -265,7 +265,11 @@ fn exclude_set_overrides_include_set() { let extensions = extensions.unwrap(); // ensure all of audio and video's extensions are here - for &ext in ExtensionSet::Audio.extensions().iter().chain(ExtensionSet::Video.extensions().iter()) { + for &ext in ExtensionSet::Audio + .extensions() + .iter() + .chain(ExtensionSet::Video.extensions().iter()) + { assert!(extensions.contains(&ext), "Extensions should contain {}!", ext) } @@ -361,10 +365,10 @@ fn media_contains_audio_video_images() { .into_iter() .for_each(|ext| assert!(media_exts.contains(&ext))); - // assert_eq!( - // Parameters::parse_from(&["fif", "-E", "media"]).extensions(), - // Parameters::parse_from(&["fif", "-E", 
"audio,video,images"]).extensions() - // ) + assert_eq!( + Parameters::parse_from(&["fif", "-E", "media"]).extensions(), + Parameters::parse_from(&["fif", "-E", "audio,video,images"]).extensions() + ) } #[test]