merged inspectors module into files
This commit is contained in:
parent
ebf2f152f0
commit
fa49dd9fb5
6 changed files with 143 additions and 147 deletions
139
src/files.rs
139
src/files.rs
|
@ -1,11 +1,20 @@
|
|||
use crate::findings::{Findings, ScanError};
|
||||
use crate::mime_db::MimeDb;
|
||||
use crate::parameters::ScanOpts;
|
||||
use crate::{inspectors, MIMEDB};
|
||||
use crate::MIMEDB;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use cached::cached;
|
||||
use cfg_if::cfg_if;
|
||||
use log::{debug, error};
|
||||
use mime::Mime;
|
||||
use mime_guess::from_ext;
|
||||
use std::collections::BTreeSet;
|
||||
use std::path::Path;
|
||||
use walkdir::{DirEntry, WalkDir};
|
||||
|
||||
cfg_if! {
|
||||
|
@ -84,7 +93,7 @@ pub fn wanted_file(
|
|||
pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
|
||||
let path = entry.path();
|
||||
// try to determine mimetype for this entry
|
||||
let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) {
|
||||
let result = match mime_type(MIMEDB.get().unwrap(), path) {
|
||||
// an error occurred while trying to read the file
|
||||
Err(_) => return Err(ScanError::File(path)),
|
||||
// the file was read successfully, but we were unable to determine its mimetype
|
||||
|
@ -94,7 +103,7 @@ pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, Sc
|
|||
};
|
||||
|
||||
// set of known extensions for the given mimetype
|
||||
let known_exts = inspectors::mime_extension_lookup(result.essence_str().into());
|
||||
let known_exts = mime_extension_lookup(result.essence_str().into());
|
||||
// file extension for this particular file
|
||||
let entry_ext = path.extension();
|
||||
|
||||
|
@ -196,3 +205,123 @@ pub fn scan_directory(
|
|||
Some(entries)
|
||||
}
|
||||
}
|
||||
|
||||
/// The number of bytes to read initially.
|
||||
///
|
||||
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
|
||||
/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
|
||||
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
|
||||
pub const INITIAL_BUF_SIZE: usize = 128;
|
||||
|
||||
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
|
||||
pub const BUF_SIZE: usize = 8192;
|
||||
|
||||
/// Tries to identify the mimetype of a file from a given path.
|
||||
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||
let mut buffer = [0; INITIAL_BUF_SIZE];
|
||||
let mut file = File::open(path)?;
|
||||
|
||||
// read a small amount to start with
|
||||
file.read(&mut buffer)?;
|
||||
|
||||
let r = db.get_type(&buffer).filter(|mime|
|
||||
// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
|
||||
// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
|
||||
mime != &mime::TEXT_XML
|
||||
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
|
||||
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
|
||||
// file.
|
||||
&& mime != &Mime::from_str("application/zip").unwrap()
|
||||
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
|
||||
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
|
||||
// will allow it to be detected correctly as the appropriate filetype.
|
||||
&& mime != &Mime::from_str("application/x-ole-storage").unwrap());
|
||||
|
||||
if r.is_some() {
|
||||
return Ok(r);
|
||||
}
|
||||
|
||||
// attempt to read up to the BUF_SIZE bytes of the file.
|
||||
// we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
|
||||
// faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
|
||||
// for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
|
||||
// with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
|
||||
// collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
|
||||
// efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
|
||||
// idiomatic/safe and fast.
|
||||
let mut buffer = [0; BUF_SIZE];
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.read(&mut buffer)?;
|
||||
Ok(db.get_type(&buffer))
|
||||
}
|
||||
|
||||
cached! {
    MIMEXT;
    fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
        // Returns a list of known extensions for this mime type, if any.
        // The lookup is keyed on the [Mime]'s "essence" string rather than the [Mime] itself, because
        // mime_guess::get_mime_extensions drops the type suffix ("image/svg+xml" is treated as "image/svg") and then
        // finds nothing. The essence string keeps the suffix, which avoids that problem.
        // (this would be a doc comment, but the cached! macro doesn't accept those, and the proc_macro flavour of
        // cached pulls in a large number of dependencies.)

        let essence = essence.as_str();

        // mime_guess' database and the detection database don't always agree on whether a subtype carries an "x-"
        // prefix. when the essence as given has no known extensions, retry with the prefix toggled.
        let known = mime_guess::get_mime_extensions_str(essence).or_else(|| {
            let alternate = if essence.contains("/x-") {
                // e.g. "application/x-gzip" becomes "application/gzip"
                essence.replace("/x-", "/")
            } else {
                // e.g. "video/mp2t" becomes "video/x-mp2t"
                essence.replace("/", "/x-")
            };
            mime_guess::get_mime_extensions_str(&alternate)
        });

        known.map(|known| {
            // classic office files: the hand-picked list below replaces mime_guess' list entirely instead of being
            // placed in front of it.
            let is_msword = essence == "application/msword";

            // extensions that should come before whatever mime_guess reports, so that the first entry of the result
            // is a sensible suggestion.
            let preferred: &[&str] = if essence == mime::IMAGE_JPEG.essence_str() {
                // mime_guess lists "jpe" first (alphabetical order), but "jpg" is by far the more common spelling.
                &["jpg"]
            } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
                // the first extension mime_guess offers for XML is "asa"; "xml" is the better suggestion.
                // "svg" is accepted as well: SVG files usually are valid XML, and one that starts with a lot of
                // whitespace or comments may be identified as plain XML before enough bytes have been read.
                // TODO: if a file is detected as application/xml, but it has an extension like "xht" which
                // corresponds to "application/xhtml+xml", let it through - in other words, if it's identified as
                // application/xml, but its extension is classed as application/*+xml, consider it OK
                &["xml", "svg"]
            } else if is_msword {
                &["doc", "xls", "ppt"]
            } else if essence == "application/zip" {
                // office XML documents tend to be detected as plain ZIP archives, so accept their extensions too.
                &["zip", "docx", "xlsx", "pptx"]
            } else if essence == "application/x-ms-dos-executable" {
                // .dll and .exe share a mime type, but one must never be renamed to the other.
                &["dll", "exe"]
            } else {
                &[]
            };

            let preferred = preferred.iter().map(|&ext| String::from(ext));
            let exts: Vec<String> = if is_msword {
                preferred.collect()
            } else {
                preferred
                    .chain(known.iter().map(|&ext| String::from(ext)))
                    .collect()
            };
            exts
        })
    }
}
|
||||
|
|
|
@ -2,7 +2,7 @@ use std::path::{Path, PathBuf};
|
|||
|
||||
use mime::Mime;
|
||||
|
||||
use crate::inspectors::mime_extension_lookup;
|
||||
use crate::files::mime_extension_lookup;
|
||||
use crate::String;
|
||||
|
||||
#[cfg(feature = "json")]
|
||||
|
@ -37,7 +37,7 @@ impl serde::Serialize for Findings {
|
|||
}
|
||||
|
||||
impl Findings {
|
||||
pub fn recommended_extension(&self) -> Option<String> {
|
||||
pub fn recommended_extension(&self) -> Option<std::string::String> {
|
||||
mime_extension_lookup(self.mime.essence_str().into()).map(|extensions| extensions[0].clone())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,133 +0,0 @@
|
|||
//! Functions for getting the mime type and extension of a file.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use cached::cached;
|
||||
use mime::Mime;
|
||||
|
||||
use crate::String;
|
||||
use crate::MimeDb;
|
||||
|
||||
/// The number of bytes to read initially.
|
||||
///
|
||||
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
|
||||
/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
|
||||
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
|
||||
pub const INITIAL_BUF_SIZE: usize = 128;
|
||||
|
||||
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
|
||||
pub const BUF_SIZE: usize = 8192;
|
||||
|
||||
/// Tries to identify the mimetype of a file from a given path.
|
||||
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||
let mut buffer = [0; INITIAL_BUF_SIZE];
|
||||
let mut file = File::open(path)?;
|
||||
|
||||
// read a small amount to start with
|
||||
file.read(&mut buffer)?;
|
||||
|
||||
let r = db.get_type(&buffer).filter(|mime|
|
||||
// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
|
||||
// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
|
||||
mime != &mime::TEXT_XML
|
||||
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
|
||||
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
|
||||
// file.
|
||||
&& mime != &Mime::from_str("application/zip").unwrap()
|
||||
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
|
||||
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
|
||||
// will allow it to be detected correctly as the appropriate filetype.
|
||||
&& mime != &Mime::from_str("application/x-ole-storage").unwrap());
|
||||
|
||||
if r.is_some() {
|
||||
return Ok(r);
|
||||
}
|
||||
|
||||
// attempt to read up to the BUF_SIZE bytes of the file.
|
||||
// we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
|
||||
// faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
|
||||
// for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
|
||||
// with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
|
||||
// collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
|
||||
// efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
|
||||
// idiomatic/safe and fast.
|
||||
let mut buffer = [0; BUF_SIZE];
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.read(&mut buffer)?;
|
||||
Ok(db.get_type(&buffer))
|
||||
}
|
||||
|
||||
cached! {
    MIMEXT;
    fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
        // Returns a list of known extensions for this mime type, if any.
        // The lookup is keyed on the [Mime]'s "essence" string rather than the [Mime] itself, because
        // mime_guess::get_mime_extensions drops the type suffix ("image/svg+xml" is treated as "image/svg") and then
        // finds nothing. The essence string keeps the suffix, which avoids that problem.
        // (this would be a doc comment, but the cached! macro doesn't accept those, and the proc_macro flavour of
        // cached pulls in a large number of dependencies.)

        let essence = essence.as_str();

        // mime_guess' database and the detection database don't always agree on whether a subtype carries an "x-"
        // prefix. when the essence as given has no known extensions, retry with the prefix toggled.
        let known = mime_guess::get_mime_extensions_str(essence).or_else(|| {
            let alternate = if essence.contains("/x-") {
                // e.g. "application/x-gzip" becomes "application/gzip"
                essence.replace("/x-", "/")
            } else {
                // e.g. "video/mp2t" becomes "video/x-mp2t"
                essence.replace("/", "/x-")
            };
            mime_guess::get_mime_extensions_str(&alternate)
        });

        known.map(|known| {
            // classic office files: the hand-picked list below replaces mime_guess' list entirely instead of being
            // placed in front of it.
            let is_msword = essence == "application/msword";

            // extensions that should come before whatever mime_guess reports, so that the first entry of the result
            // is a sensible suggestion.
            let preferred: &[&str] = if essence == mime::IMAGE_JPEG.essence_str() {
                // mime_guess lists "jpe" first (alphabetical order), but "jpg" is by far the more common spelling.
                &["jpg"]
            } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
                // the first extension mime_guess offers for XML is "asa"; "xml" is the better suggestion.
                // "svg" is accepted as well: SVG files usually are valid XML, and one that starts with a lot of
                // whitespace or comments may be identified as plain XML before enough bytes have been read.
                // TODO: if a file is detected as application/xml, but it has an extension like "xht" which
                // corresponds to "application/xhtml+xml", let it through - in other words, if it's identified as
                // application/xml, but its extension is classed as application/*+xml, consider it OK
                &["xml", "svg"]
            } else if is_msword {
                &["doc", "xls", "ppt"]
            } else if essence == "application/zip" {
                // office XML documents tend to be detected as plain ZIP archives, so accept their extensions too.
                &["zip", "docx", "xlsx", "pptx"]
            } else if essence == "application/x-ms-dos-executable" {
                // .dll and .exe share a mime type, but one must never be renamed to the other.
                &["dll", "exe"]
            } else {
                &[]
            };

            let preferred = preferred.iter().map(|&ext| String::from(ext));
            let exts: Vec<String> = if is_msword {
                preferred.collect()
            } else {
                preferred
                    .chain(known.iter().map(|&ext| String::from(ext)))
                    .collect()
            };
            exts
        })
    }
}
|
|
@ -4,7 +4,6 @@
|
|||
pub mod files;
|
||||
pub mod findings;
|
||||
pub mod formats;
|
||||
pub mod inspectors;
|
||||
pub mod mime_db;
|
||||
pub mod parameters;
|
||||
pub mod utils;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//! [Clap] struct used to parse command line arguments.
|
||||
|
||||
use crate::String as StringType;
|
||||
use crate::utils::{clap_long_version, clap_version};
|
||||
use crate::String as StringType;
|
||||
use cfg_if::cfg_if;
|
||||
use clap::{AppSettings, Clap};
|
||||
use std::collections::BTreeSet;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use fif::files::{mime_extension_lookup, BUF_SIZE};
|
||||
use fif::files::{scan_directory, scan_from_walkdir};
|
||||
use fif::findings::Findings;
|
||||
use fif::formats::{Format, PowerShell, Shell};
|
||||
use fif::inspectors::{mime_extension_lookup, BUF_SIZE};
|
||||
use fif::mime_db::MimeDb;
|
||||
use fif::String;
|
||||
|
||||
|
@ -66,18 +66,19 @@ fn detect_type() {
|
|||
/// Ensure that `mime_extension_lookup` works as expected, and that the set of extensions for JPEG, PNG, PDF, and ZIP
|
||||
/// contain "jpg", "png", "pdf", and "zip", respectively.
|
||||
fn recommend_ext() {
|
||||
use std::string::String as StdString;
|
||||
assert!(mime_extension_lookup(IMAGE_JPEG.essence_str().into())
|
||||
.unwrap()
|
||||
.contains(&String::from("jpg")));
|
||||
.contains(&StdString::from("jpg")));
|
||||
assert!(mime_extension_lookup(IMAGE_PNG.essence_str().into())
|
||||
.unwrap()
|
||||
.contains(&String::from("png")));
|
||||
.contains(&StdString::from("png")));
|
||||
assert!(mime_extension_lookup(APPLICATION_PDF.essence_str().into())
|
||||
.unwrap()
|
||||
.contains(&String::from("pdf")));
|
||||
.contains(&StdString::from("pdf")));
|
||||
assert!(mime_extension_lookup(application_zip().essence_str().into())
|
||||
.unwrap()
|
||||
.contains(&String::from("zip")));
|
||||
.contains(&StdString::from("zip")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
Loading…
Reference in a new issue