more comprehensive ext sets, link text/xml and application/xml, remove silly Powershell/PowerShell hack

This commit is contained in:
Lynne Megido 2021-04-28 19:33:42 +10:00
parent 4f5914ed75
commit cb6e111f16
Signed by: lynnesbian
GPG Key ID: F0A184B5213D9F90
6 changed files with 53 additions and 34 deletions

4
Cargo.lock generated
View File

@ -349,9 +349,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "new_mime_guess"
version = "2.0.4"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9eafbe0ec4560a250f49cda59952d7a377b364918d7d8163d71790158c986e54"
checksum = "e714f72c691c7d2b344ec8dd57d7f52b59651f46b9de477fb68363f097d694ae"
dependencies = [
"mime",
"unicase",

View File

@ -26,7 +26,7 @@ xdg-mime-backend = []
[dependencies]
walkdir = "2.3.2"
log = "0.4.14"
mime_guess = { package = "new_mime_guess", version = "2.0.4" }
mime_guess = { package = "new_mime_guess", version = "2.1.0" }
snailquote = "0.3.0"
once_cell = "1.7.2"
infer = "0.4.0"

View File

@ -101,14 +101,17 @@ cached! {
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
[vec![String::from("jpg")], possible_exts].concat()
} else if mime == mime_guess::mime::TEXT_XML {
// a somewhat similar case arises with XML files - the first suggested extension is "addin", when it should
} else if mime == mime_guess::mime::TEXT_XML || mime == Mime::from_str("application/xml").unwrap() {
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
// (in my opinion) be "xml".
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
// to have valid extensions.
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
// extension is classed as application/*+xml, consider it OK
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
} else if mime == Mime::from_str("application/msword").unwrap() {

View File

@ -34,6 +34,7 @@ use crate::formats::{Format, PowerShell, Shell};
use crate::mime_db::MimeDb;
use crate::parameters::{OutputFormat, ScanOpts};
use crate::scan_error::ScanError;
use std::collections::BTreeSet;
mod findings;
mod formats;
@ -143,7 +144,7 @@ fn main() {
let result = match args.output_format {
OutputFormat::Sh => Shell::new().write_all(&results, &mut buffered_stdout),
OutputFormat::PowerShell | OutputFormat::Powershell => PowerShell::new().write_all(&results, &mut buffered_stdout),
OutputFormat::PowerShell => PowerShell::new().write_all(&results, &mut buffered_stdout),
OutputFormat::Text => todo!(),
};
@ -184,7 +185,12 @@ cfg_if! {
/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
/// `exts` (if specified), potentially skipping over hidden files, and so on.
fn wanted_file(entry: &DirEntry, exts: Option<&Vec<&str>>, exclude: Option<&Vec<&str>>, scan_opts: &ScanOpts) -> bool {
fn wanted_file(
entry: &DirEntry,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> bool {
if entry.depth() == 0 {
// the root directory should always be scanned.
return true;
@ -292,8 +298,8 @@ fn scan_from_walkdir(entries: &[DirEntry]) -> Vec<Result<Findings, ScanError>> {
/// [DirEntry]s.
fn scan_directory(
dirs: &Path,
exts: Option<&Vec<&str>>,
exclude: Option<&Vec<&str>>,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> Option<Vec<DirEntry>> {
let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();

View File

@ -3,6 +3,7 @@
use crate::string_type::String as StringType;
use cfg_if::cfg_if;
use clap::{AppSettings, Clap};
use std::collections::BTreeSet;
use std::path::PathBuf;
cfg_if! {
@ -18,9 +19,8 @@ pub enum OutputFormat {
/// A Bourne shell compatible script.
Sh,
/// A PowerShell script.
#[clap(alias = "powershell")]
PowerShell,
/// Also a PowerShell script, with different casing to allow for `fif -o powershell`.
Powershell,
/// Plain text.
Text,
}
@ -44,7 +44,6 @@ pub struct Parameters {
// `-e sil\,ly` is treated as ["sil", "ly"] rather than as ["silly"], no matter how i escape the comma (in bash,
// anyway). is this really an issue? it does technically exclude some perfectly valid extensions, but i've never seen
// a file extension with a comma in its name before.
/// Only examine files with these extensions.
/// Multiple extensions can be specified by either using the flag multiple times (`-e jpg -e png -e gif`), or by
/// separating them with commas (`-e jpg,png,gif`).
@ -113,7 +112,7 @@ pub struct ScanOpts {
impl Parameters {
/// Returns an optional set of the extensions to be scanned - i.e., extensions specified via the `-e` or `-E` flag,
/// minus the extensions excluded with the `-x` flag; i.e., the difference between the included and excluded sets.
pub fn extensions(&self) -> Option<Vec<&str>> {
pub fn extensions(&self) -> Option<BTreeSet<&str>> {
if let Some(included) = self.included_extensions() {
if let Some(excluded) = self.excluded_extensions() {
// return included extensions without excluded extensions
@ -131,30 +130,34 @@ impl Parameters {
/// Returns an optional set of extensions that were specified by `-e` or `-E`. Note that this doesn't account for
/// extensions excluded by the exclusion flags.
pub fn included_extensions(&self) -> Option<Vec<&str>> {
let mut included = vec![];
if let Some(exts) = self.exts.as_ref() { // -e
pub fn included_extensions(&self) -> Option<BTreeSet<&str>> {
let mut included = BTreeSet::new();
if let Some(exts) = self.exts.as_ref() {
// -e
included.extend(exts.iter().map(|ext| ext.as_str()));
}
if !&self.ext_set.is_empty() { // -E
if !&self.ext_set.is_empty() {
// -E
included.extend(self.ext_set.iter().flat_map(|set| set.extensions()));
}
match included {
x if x.is_empty() => None,
x => Some(x)
x => Some(x),
}
}
/// Returns an optional set of extensions that were specified by `-x` or `-X`.
pub fn excluded_extensions(&self) -> Option<Vec<&str>> {
let mut excluded = vec![];
if let Some(exclude) = self.exclude.as_ref() { // -x
pub fn excluded_extensions(&self) -> Option<BTreeSet<&str>> {
let mut excluded = BTreeSet::new();
if let Some(exclude) = self.exclude.as_ref() {
// -x
excluded.extend(exclude.iter().map(|ext| ext.as_str()));
}
if !&self.exclude_set.is_empty() { // -X
if !&self.exclude_set.is_empty() {
// -X
excluded.extend(self.exclude_set.iter().flat_map(|set| set.extensions()));
}
@ -162,7 +165,7 @@ impl Parameters {
// tongue twister: enter X-options' excellent extension exclusion
match excluded {
x if x.is_empty() => None,
x => Some(x)
x => Some(x),
}
}
@ -226,13 +229,16 @@ impl ExtensionSet {
"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps",
"pages", "key", "numbers",
],
Self::Text => mime_guess::get_mime_extensions_str("text/*").unwrap().to_vec(),
Self::Text => [mime_guess::get_mime_extensions_str("text/*").unwrap(), &["js", "pl", "csh", "sh", "bash", "zsh", "fish", "bat", "php"]].concat(),
// many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used
// somehow to extract extensions for compressed files from mime_guess?
Self::Archives => vec!["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2", "tgz", "rpa"],
Self::Archives => vec![
"zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2", "tgz", "rpa", "txz", "tz2", "sea", "sitx", "z",
"cpio",
],
Self::System => vec![
"com", "dll", "exe", "sys", "reg", "nt", "cpl", "msi", "efi", "bio", "rcv", "mbr", "sbf", "grub", "ko",
"dylib", "pdb", "hdmp", "crash",
"dylib", "pdb", "hdmp", "crash", "cab",
],
}
}

View File

@ -5,15 +5,15 @@ use crate::mime_db::MimeDb;
use crate::string_type::String;
use crate::{extension_from_path, scan_directory, scan_from_walkdir};
use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG};
use mime_guess::Mime;
use crate::parameters::Parameters;
use clap::Clap;
use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG};
use mime_guess::Mime;
use crate::parameters::ExtensionSet;
use std::collections::HashMap;
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use crate::parameters::ExtensionSet;
const JPEG_BYTES: &[u8] = b"\xFF\xD8\xFF";
const PNG_BYTES: &[u8] = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";
@ -265,7 +265,11 @@ fn exclude_set_overrides_include_set() {
let extensions = extensions.unwrap();
// ensure all of audio and video's extensions are here
for &ext in ExtensionSet::Audio.extensions().iter().chain(ExtensionSet::Video.extensions().iter()) {
for &ext in ExtensionSet::Audio
.extensions()
.iter()
.chain(ExtensionSet::Video.extensions().iter())
{
assert!(extensions.contains(&ext), "Extensions should contain {}!", ext)
}
@ -361,10 +365,10 @@ fn media_contains_audio_video_images() {
.into_iter()
.for_each(|ext| assert!(media_exts.contains(&ext)));
// assert_eq!(
// Parameters::parse_from(&["fif", "-E", "media"]).extensions(),
// Parameters::parse_from(&["fif", "-E", "audio,video,images"]).extensions()
// )
assert_eq!(
Parameters::parse_from(&["fif", "-E", "media"]).extensions(),
Parameters::parse_from(&["fif", "-E", "audio,video,images"]).extensions()
)
}
#[test]