Compare commits

..

No commits in common. "88b5070a03e9cb1673e52817a9793831f16bd4d0" and "6e2e788a614eb1e15abe414035b49f14a7fc1982" have entirely different histories.

14 changed files with 435 additions and 492 deletions

View file

@ -2,11 +2,6 @@
Dates are given in YYYY-MM-DD format.
## v0.3
### v0.3.7 (2021-MM-DD)
#### Other
- Refactoring - split fif into main.rs and lib.rs, moved file-related functionality (directory scanning, etc.) into
files module, removed string module, etc.
### v0.3.6 (2021-08-16)
#### Other
- Fixed another major dependency issue - [`clap`] version 3 beta 2 pulls in `clap_derive` version 3 beta **4**, causing

36
Cargo.lock generated
View file

@ -286,9 +286,9 @@ dependencies = [
[[package]]
name = "itoa"
version = "0.4.8"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "lazy_static"
@ -311,9 +311,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.101"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21"
checksum = "a7f823d141fe0a24df1e23b4af4e3c7ba9e5966ec514ea068c93024aa7deb765"
[[package]]
name = "log"
@ -461,9 +461,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.29"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d"
checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612"
dependencies = [
"unicode-xid",
]
@ -593,18 +593,18 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "serde"
version = "1.0.130"
version = "1.0.127"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
checksum = "f03b9878abf6d14e6779d3f24f07b2cfa90352cfec4acc5aab8f1ac7f146fae8"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.130"
version = "1.0.127"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b"
checksum = "a024926d3432516606328597e0f224a51355a493b49fdd67e9209187cbe55ecc"
dependencies = [
"proc-macro2",
"quote",
@ -613,9 +613,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.67"
version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950"
checksum = "336b10da19a12ad094b59d870ebde26a45402e5b470add4b5fd03c5048a32127"
dependencies = [
"itoa",
"ryu",
@ -655,9 +655,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "syn"
version = "1.0.75"
version = "1.0.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7f58f7e8eaa0009c5fec437aabf511bd9933e4b2d7407bd05273c01a8906ea7"
checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c"
dependencies = [
"proc-macro2",
"quote",
@ -709,18 +709,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.28"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "283d5230e63df9608ac7d9691adc1dfb6e701225436eb64d0b9a7f0a5a04f6ec"
checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.28"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa3884228611f5cd3608e2d409bf7dce832e4eb3135e3f11addbd7e41bd68e71"
checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745"
dependencies = [
"proc-macro2",
"quote",

View file

@ -25,7 +25,7 @@ xdg-mime-backend = ["xdg-mime"]
json = ["serde", "serde_json"]
[dependencies]
walkdir = "~2.3.2"
walkdir = "2.3.2"
log = "0.4.14"
mime = "0.3.16"
mime_guess = { package = "new_mime_guess", features = ["phf-map"], version = "3.0.0" }

View file

@ -8,7 +8,7 @@
[![Version](https://img.shields.io/crates/v/fif.svg?logo=rust&style=flat-square)
](https://crates.io/crates/fif)
[![Minimum Supported Rust Version](https://img.shields.io/badge/msrv-1.43.0-orange?logo=rust&style=flat-square)
](https://gitlab.com/Lynnesbian/fif/-/blob/master/README.md#version-policy)
](https://crates.io/crates/fif)
[![License](https://img.shields.io/crates/l/fif.svg?style=flat-square)
](https://gitlab.com/Lynnesbian/fif/-/blob/master/LICENSE)
[![Build status](https://img.shields.io/gitlab/pipeline/Lynnesbian/fif/master?logo=gitlab&style=flat-square)
@ -187,32 +187,3 @@ a more concise overview).
[`xdg-mime`]: https://crates.io/crates/xdg-mime
[`infer`]: https://crates.io/crates/infer
[Shared MIME Info]: https://gitlab.freedesktop.org/xdg/shared-mime-info/
## Version policy
fif adheres to the [semantic versioning](https://semver.org/) principles. While fif remains at version 0.x, the version
number will be updated as follows:
- The MAJOR version will be bumped to 1 when I believe fif to be "feature complete".
- The MINOR version will be bumped whenever I add a fairly important feature to fif (in the past, this has been bumped
when adding the ability to exclude extensions, and when fif gained the ability to output a bash script rather than a
list of invalid filenames). The MINOR version will also be bumped when increasing the MSRV.
- The PATCH version will be bumped in all other cases, including minor feature additions (in the past, this has occurred
when adding features such as more output formats and the ignore flag).
If/when fif hits version 1.0, these rules will likely remain the same as they are now.
## License
Copyright (C) 2021 Lynnesbian
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

View file

@ -34,9 +34,6 @@ for backend in "${_backends[@]}"; do
-A clippy::multiple-crate-versions \
-A clippy::cast-possible-truncation \
-A clippy::cast-possible-wrap \
-A clippy::must_use_candidate \
-A clippy::missing_panics_doc \
-A clippy::missing_errors_doc \
"$_extra"
done
@ -46,8 +43,5 @@ done
# shadow_unrelated: sometimes things that seem unrelated are actually related ;)
# option_if_let_else: the suggested code is usually harder to read than the original
# multiple_crate_versions: cached uses an old version of hashbrown :c
# cast_possible_truncation: only ever used where it would be totally fine
# cast_possible_wrap: ditto
# must_use_candidate: useless
# missing_panics_doc: the docs are just for me, fif isn't really intended to be used as a library, so this is unneeded
# missing_errors_doc: ditto
# cast-possible-truncation: only ever used where it would be totally fine
# cast-possible-wrap: ditto

View file

@ -1,327 +0,0 @@
use crate::findings::{Findings, ScanError};
use crate::mime_db::MimeDb;
use crate::parameters::ScanOpts;
use crate::{String, MIMEDB};
use std::collections::BTreeSet;
use std::fs::File;
use std::io;
use std::io::{Read, Seek, SeekFrom};
use std::path::Path;
use std::str::FromStr;
use cached::cached;
use cfg_if::cfg_if;
use log::{debug, error};
use mime::Mime;
use mime_guess::from_ext;
use walkdir::{DirEntry, WalkDir};
cfg_if! {
if #[cfg(windows)] {
/// Determines whether or not a file is hidden by checking its win32 file attributes.
pub fn is_hidden(entry: &DirEntry) -> bool {
use std::os::windows::prelude::*;
std::fs::metadata(entry.path()) // try to get metadata for file
.map_or(
false, // if getting metadata/attributes fails, assume it's not hidden
|f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
)
}
} else {
/// Determines whether or not a file is hidden by checking for a leading full stop.
pub fn is_hidden(entry: &DirEntry) -> bool {
entry
.file_name()
.to_str()
.map_or(false, |f| f.starts_with('.') && f != ".")
}
}
}
/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
/// `exts` (if specified), potentially skipping over hidden files, and so on.
pub fn wanted_file(
entry: &DirEntry,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> bool {
if entry.depth() == 0 {
// the root directory should always be scanned.
return true;
}
if !scan_opts.hidden && is_hidden(entry) {
// skip hidden files and directories. this check is performed first because it's very lightweight.
return false;
}
if entry.file_type().is_dir() {
// always allow directories - there's no point doing file extension matching on something that isn't a file.
return true;
}
if let Some(ext) = entry.path().extension() {
// file has extension - discard invalid UTF-8 and normalise it to lowercase.
let ext = ext.to_string_lossy().to_lowercase();
let ext = ext.as_str();
if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() {
// unknown extension, skip.
return false;
}
if let Some(exts) = exts {
// only scan if the file has one of the specified extensions.
exts.contains(&ext)
} else {
// no extensions specified - the file should be scanned unless its extension is on the exclude list.
exclude.map_or(true, |exclude| !exclude.contains(&ext))
}
} else {
// no file extension
scan_opts.extensionless
}
}
/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
///
/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a
/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be
/// determined.
pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
let path = entry.path();
// try to determine mimetype for this entry
let result = match mime_type(MIMEDB.get().unwrap(), path) {
// an error occurred while trying to read the file
Err(_) => return Err(ScanError::File(path)),
// the file was read successfully, but we were unable to determine its mimetype
Ok(None) => return Err(ScanError::Mime(path)),
// a mimetype was found!
Ok(Some(result)) => result,
};
// set of known extensions for the given mimetype
let known_exts = mime_extension_lookup(result.essence_str().into());
// file extension for this particular file
let entry_ext = path.extension();
let valid = match known_exts {
// there is a known set of extensions for this mimetype, and the file has an extension
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()),
// either this file has no extension, or there is no known set of extensions for this mimetype :(
Some(_) | None => false,
};
let path = if canonical_paths {
match std::fs::canonicalize(path) {
Ok(path) => path,
Err(_) => return Err(ScanError::File(entry.path())),
}
} else {
path.to_path_buf() // :c
};
Ok(Findings {
file: path,
valid,
mime: result,
})
}
/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
cfg_if! {
if #[cfg(feature = "multi-threaded")] {
use rayon::prelude::*;
// split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
entries
.par_chunks(32)
.flat_map(|chunk| {
chunk
.iter() // iter over the chunk, which is a slice of DirEntry structs
.map(|entry| scan_file(entry, canonical_paths))
.collect::<Vec<_>>()
})
.collect()
} else {
entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
}
}
}
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
/// [DirEntry]s.
pub fn scan_directory(
dirs: &Path,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> Option<Vec<DirEntry>> {
let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
let mut probably_fatal_error = false;
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
.filter_map(|e| {
if let Err(err) = &e {
debug!("uh oh spaghettio!! {:#?}", e);
// log errors to stdout, and remove them from the iterator
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
if err.depth() == 0 {
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
probably_fatal_error = true;
}
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
error!(
"{}: {}",
path,
err.io_error().map_or(err.to_string(), |e| e.to_string())
);
return None;
}
e.ok()
})
// remove directories from the final list
.filter(|e| !e.file_type().is_dir())
// if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
// any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
// if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
// output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
// confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
// https://i.imgur.com/DYG7jlB.png
// adding the symlink filter removes the line that's being pointed to in the image. 0u0
.filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
.collect();
if probably_fatal_error {
None
} else {
Some(entries)
}
}
/// The number of bytes to read initially.
///
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
pub const INITIAL_BUF_SIZE: usize = 128;
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
pub const BUF_SIZE: usize = 8192;
/// Tries to identify the mimetype of a file from a given path.
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
let mut buffer = [0; INITIAL_BUF_SIZE];
let mut file = File::open(path)?;
// read a small amount to start with
file.read(&mut buffer)?;
let r = db.get_type(&buffer).filter(|mime|
// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
mime != &mime::TEXT_XML
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
// file.
&& mime != &Mime::from_str("application/zip").unwrap()
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
// will allow it to be detected correctly as the appropriate filetype.
&& mime != &Mime::from_str("application/x-ole-storage").unwrap());
if r.is_some() {
return Ok(r);
}
// attempt to read up to the BUF_SIZE bytes of the file.
// we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
// faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
// for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
// with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
// collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
// efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
// idiomatic/safe and fast.
let mut buffer = [0; BUF_SIZE];
file.seek(SeekFrom::Start(0))?;
file.read(&mut buffer)?;
Ok(db.get_type(&buffer))
}
cached! {
MIMEXT;
fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
// Returns a list of known extensions for this mime type, if any.
// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
// essence_str (which includes the suffix) fixes this.
// ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the
// proc_macro version of cached, but it has a huge number of deps :c
let essence = essence.as_str();
let mut exts = mime_guess::get_mime_extensions_str(essence);
if exts.is_none() {
// no matches :c
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
// but mime_guess only understands "some/thing", or vice-versa.
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
// "some/x-thing".
if essence.contains("/x-") {
// replace e.g. "application/x-gzip" with "application/gzip"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
} else {
// replace e.g. "video/mp2t" with "video/x-mp2t"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
}
}
match exts {
Some(exts) => {
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
Some(if essence == mime::IMAGE_JPEG.essence_str() {
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
[vec![String::from("jpg")], possible_exts].concat()
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
// (in my opinion) be "xml".
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
// to have valid extensions.
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
// extension is classed as application/*+xml, consider it OK
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
} else if essence == "application/msword" {
// classic office files considered harmful
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
} else if essence == "application/zip" {
// neither xdg-mime nor infer seem to be able to detect office XML files properly...
[vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat()
} else if essence == "application/x-ms-dos-executable" {
// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
// other!
[vec![String::from("dll"), String::from("exe")], possible_exts].concat()
} else {
possible_exts
})
},
None => None
}
}
}

View file

@ -2,8 +2,8 @@ use std::path::{Path, PathBuf};
use mime::Mime;
use crate::files::mime_extension_lookup;
use crate::String;
use crate::inspectors::mime_extension_lookup;
use crate::string_type::String;
#[cfg(feature = "json")]
use serde::{ser::SerializeStruct, Serializer};

View file

@ -12,21 +12,17 @@ use snailquote::escape;
use crate::findings::ScanError;
use crate::utils::clap_long_version;
use crate::Findings;
use itertools::{Either, Itertools};
use itertools::Itertools;
/// A macro for creating an array of `Writable`s without needing to pepper your code with `into()`s.
/// # Usage
/// ```
/// use crate::fif::writables;
/// use crate::fif::formats::{Writable, smart_write};
/// let mut f = std::io::stdout();
///
/// let f = std::io::stdout();
/// // Instead of...
/// smart_write(&mut f, &["hello".into(), Writable::Newline]);
/// smart_write(f, &["hello".into(), Writable::Newline]);
/// // ...just use:
/// smart_write(&mut f, writables!["hello", Newline]);
/// smart_write(f, writables!["hello", Newline]);
/// ```
#[macro_export]
macro_rules! writables {
[$($args:tt),+] => {
@ -75,7 +71,7 @@ impl<'a> From<&'a OsStr> for Writable<'a> {
fn generated_by() -> String { format!("Generated by fif {}", clap_long_version()) }
pub fn smart_write<W: Write>(f: &mut W, writeables: &[Writable]) -> io::Result<()> {
fn smart_write<W: Write>(f: &mut W, writeables: &[Writable]) -> io::Result<()> {
// ehhhh
for writeable in writeables {
match writeable {
@ -140,12 +136,12 @@ pub trait FormatSteps {
// sort errors so unreadable files appear before files with unknown mimetypes - ScanError impls Ord such that
// ScanError::File > ScanError::Mime
let errors = entries.iter().filter_map(|e| e.as_ref().err()).sorted_unstable();
let errors = entries.iter().filter_map(|e| e.as_ref().err()).sorted();
// sort files so that files with no known extension come before those with known extensions - None > Some("jpg")
let findings = entries
.iter()
.filter_map(|e| e.as_ref().ok())
.sorted_unstable_by(|a, b| b.recommended_extension().cmp(&a.recommended_extension()).reverse());
.sorted_by(|a, b| b.recommended_extension().cmp(&a.recommended_extension()).reverse());
for error in errors {
match error {
@ -344,12 +340,13 @@ impl Format for Json {
findings: &'a Vec<&'a Findings>,
}
let (errors, findings) = &entries.iter().partition_map(|entry| match entry {
Err(e) => Either::Left(e),
Ok(f) => Either::Right(f),
});
let result = serde_json::to_writer_pretty(f, &SerdeEntries { errors, findings });
let result = serde_json::to_writer_pretty(
f,
&SerdeEntries {
errors: &entries.iter().filter_map(|e| e.as_ref().err()).sorted().collect(),
findings: &entries.iter().filter_map(|f| f.as_ref().ok()).sorted().collect(),
},
);
if let Err(err) = result {
log::error!("Error while serialising: {}", err);

133
src/inspectors.rs Normal file
View file

@ -0,0 +1,133 @@
//! Functions for getting the mime type and extension of a file.
use std::fs::File;
use std::io;
use std::io::{Read, Seek, SeekFrom};
use std::path::Path;
use std::str::FromStr;
use cached::cached;
use mime::Mime;
use crate::mime_db::MimeDb;
use crate::string_type::String;
/// The number of bytes to read initially.
///
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
pub const INITIAL_BUF_SIZE: usize = 128;
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
pub const BUF_SIZE: usize = 8192;
/// Tries to identify the mimetype of a file from a given path.
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
let mut buffer = [0; INITIAL_BUF_SIZE];
let mut file = File::open(path)?;
// read a small amount to start with
file.read(&mut buffer)?;
let r = db.get_type(&buffer).filter(|mime|
// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
mime != &mime::TEXT_XML
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
// file.
&& mime != &Mime::from_str("application/zip").unwrap()
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
// will allow it to be detected correctly as the appropriate filetype.
&& mime != &Mime::from_str("application/x-ole-storage").unwrap());
if r.is_some() {
return Ok(r);
}
// attempt to read up to the BUF_SIZE bytes of the file.
// we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
// faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
// for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
// with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
// collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
// efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
// idiomatic/safe and fast.
let mut buffer = [0; BUF_SIZE];
file.seek(SeekFrom::Start(0))?;
file.read(&mut buffer)?;
Ok(db.get_type(&buffer))
}
cached! {
MIMEXT;
fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
// Returns a list of known extensions for this mime type, if any.
// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
// essence_str (which includes the suffix) fixes this.
// ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the
// proc_macro version of cached, but it has a huge number of deps :c
let essence = essence.as_str();
let mut exts = mime_guess::get_mime_extensions_str(essence);
if exts.is_none() {
// no matches :c
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
// but mime_guess only understands "some/thing", or vice-versa.
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
// "some/x-thing".
if essence.contains("/x-") {
// replace e.g. "application/x-gzip" with "application/gzip"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
} else {
// replace e.g. "video/mp2t" with "video/x-mp2t"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
}
}
match exts {
Some(exts) => {
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
Some(if essence == mime::IMAGE_JPEG.essence_str() {
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
[vec![String::from("jpg")], possible_exts].concat()
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
// (in my opinion) be "xml".
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
// to have valid extensions.
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
// extension is classed as application/*+xml, consider it OK
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
} else if essence == "application/msword" {
// classic office files considered harmful
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
} else if essence == "application/zip" {
// neither xdg-mime nor infer seem to be able to detect office XML files properly...
[vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat()
} else if essence == "application/x-ms-dos-executable" {
// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
// other!
[vec![String::from("dll"), String::from("exe")], possible_exts].concat()
} else {
possible_exts
})
},
None => None
}
}
}

View file

@ -1,52 +0,0 @@
#![forbid(unsafe_code)]
#![warn(trivial_casts, unused_lifetimes, unused_qualifications)]
pub mod files;
pub mod findings;
pub mod formats;
pub mod mime_db;
pub mod parameters;
pub mod utils;
use crate::findings::Findings;
use crate::mime_db::MimeDb;
use cfg_if::cfg_if;
use once_cell::sync::OnceCell;
cfg_if! {
if #[cfg(not(all(target_endian = "big", target_pointer_width = "32")))] {
// most architectures
pub use smartstring::alias::String;
} else {
// powerpc and other big endian 32-bit archs
pub use std::string::String;
}
}
cfg_if! {
if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
/// A [OnceCell] holding an instance of [mime_db::MimeDb].
pub static MIMEDB: OnceCell<mime_db::InferDb> = OnceCell::new();
} else {
/// A [OnceCell] holding an instance of [mime_db::MimeDb].
pub static MIMEDB: OnceCell<mime_db::XdgDb> = OnceCell::new();
}
}
/// Initialises [`MIMEDB`] with a value dependent on the current backend.
pub fn init_db() {
cfg_if! {
if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
MIMEDB
.set(crate::mime_db::InferDb::init())
.or(Err("Failed to initialise Infer backend!"))
.unwrap();
} else {
MIMEDB
.set(crate::mime_db::XdgDb::init())
.or(Err("Failed to initialise XDG Mime backend!"))
.unwrap();
}
}
}

View file

@ -18,19 +18,44 @@
#![warn(trivial_casts, unused_lifetimes, unused_qualifications)]
use std::io::{stdout, BufWriter, Write};
use std::path::Path;
use std::process::exit;
use cfg_if::cfg_if;
use clap::Clap;
use log::{debug, error, info, trace, warn, Level};
use once_cell::sync::OnceCell;
use walkdir::{DirEntry, WalkDir};
use fif::files::{scan_directory, scan_from_walkdir};
use fif::formats::Format;
use fif::parameters::OutputFormat;
use fif::utils::{clap_long_version, os_name};
use fif::{formats, init_db, parameters};
use crate::findings::Findings;
use crate::findings::ScanError;
use crate::formats::Format;
use crate::mime_db::MimeDb;
use crate::parameters::{OutputFormat, ScanOpts};
use crate::utils::{clap_long_version, os_name};
use mime_guess::from_ext;
use std::collections::BTreeSet;
mod findings;
mod formats;
mod inspectors;
mod mime_db;
mod parameters;
pub(crate) mod string_type;
#[cfg(test)]
mod tests;
mod utils;
cfg_if! {
if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
/// A [OnceCell] holding an instance of [mime_db::MimeDb].
static MIMEDB: OnceCell<mime_db::InferDb> = OnceCell::new();
} else {
/// A [OnceCell] holding an instance of [mime_db::MimeDb].
static MIMEDB: OnceCell<mime_db::XdgDb> = OnceCell::new();
}
}
#[doc(hidden)]
#[allow(clippy::cognitive_complexity)]
@ -142,3 +167,209 @@ fn main() {
debug!("Done");
}
cfg_if! {
if #[cfg(windows)] {
/// Determines whether or not a file is hidden by checking its win32 file attributes.
fn is_hidden(entry: &DirEntry) -> bool {
use std::os::windows::prelude::*;
std::fs::metadata(entry.path()) // try to get metadata for file
.map_or(
false, // if getting metadata/attributes fails, assume it's not hidden
|f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
)
}
} else {
/// Determines whether or not a file is hidden by checking for a leading full stop.
fn is_hidden(entry: &DirEntry) -> bool {
entry
.file_name()
.to_str()
.map_or(false, |f| f.starts_with('.') && f != ".")
}
}
}
/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
/// `exts` (if specified), potentially skipping over hidden files, and so on.
fn wanted_file(
entry: &DirEntry,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> bool {
if entry.depth() == 0 {
// the root directory should always be scanned.
return true;
}
if !scan_opts.hidden && is_hidden(entry) {
// skip hidden files and directories. this check is performed first because it's very lightweight.
return false;
}
if entry.file_type().is_dir() {
// always allow directories - there's no point doing file extension matching on something that isn't a file.
return true;
}
if let Some(ext) = entry.path().extension() {
// file has extension - discard invalid UTF-8 and normalise it to lowercase.
let ext = ext.to_string_lossy().to_lowercase();
let ext = ext.as_str();
if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() {
// unknown extension, skip.
return false;
}
if let Some(exts) = exts {
// only scan if the file has one of the specified extensions.
exts.contains(&ext)
} else {
// no extensions specified - the file should be scanned unless its extension is on the exclude list.
exclude.map_or(true, |exclude| !exclude.contains(&ext))
}
} else {
// no file extension
scan_opts.extensionless
}
}
/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
///
/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a
/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be
/// determined.
fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
let path = entry.path();
// try to determine mimetype for this entry
let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) {
// an error occurred while trying to read the file
Err(_) => return Err(ScanError::File(path)),
// the file was read successfully, but we were unable to determine its mimetype
Ok(None) => return Err(ScanError::Mime(path)),
// a mimetype was found!
Ok(Some(result)) => result,
};
// set of known extensions for the given mimetype
let known_exts = inspectors::mime_extension_lookup(result.essence_str().into());
// file extension for this particular file
let entry_ext = path.extension();
let valid = match known_exts {
// there is a known set of extensions for this mimetype, and the file has an extension
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()),
// either this file has no extension, or there is no known set of extensions for this mimetype :(
Some(_) | None => false,
};
let path = if canonical_paths {
match std::fs::canonicalize(path) {
Ok(path) => path,
Err(_) => return Err(ScanError::File(entry.path())),
}
} else {
path.to_path_buf() // :c
};
Ok(Findings {
file: path,
valid,
mime: result,
})
}
/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
cfg_if! {
if #[cfg(feature = "multi-threaded")] {
use rayon::prelude::*;
// split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
entries
.par_chunks(32)
.flat_map(|chunk| {
chunk
.iter() // iter over the chunk, which is a slice of DirEntry structs
.map(|entry| scan_file(entry, canonical_paths))
.collect::<Vec<_>>()
})
.collect()
} else {
entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
}
}
}
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
/// [DirEntry]s.
fn scan_directory(
dirs: &Path,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> Option<Vec<DirEntry>> {
let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
let mut probably_fatal_error = false;
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
.filter_map(|e| {
if let Err(err) = &e {
debug!("uh oh spaghettio!! {:#?}", e);
// log errors to stdout, and remove them from the iterator
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
if err.depth() == 0 {
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
probably_fatal_error = true;
}
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
error!(
"{}: {}",
path,
err.io_error().map_or(err.to_string(), |e| e.to_string())
);
return None;
}
e.ok()
})
// remove directories from the final list
.filter(|e| !e.file_type().is_dir())
// if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
// any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
// if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
// output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
// confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
// https://i.imgur.com/DYG7jlB.png
// adding the symlink filter removes the line that's being pointed to in the image. 0u0
.filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
.collect();
if probably_fatal_error {
None
} else {
Some(entries)
}
}
/// Initialises [`MIMEDB`] with a value dependent on the current backend.
fn init_db() {
cfg_if! {
if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
MIMEDB
.set(mime_db::InferDb::init())
.or(Err("Failed to initialise Infer backend!"))
.unwrap();
} else {
MIMEDB
.set(mime_db::XdgDb::init())
.or(Err("Failed to initialise XDG Mime backend!"))
.unwrap();
}
}
}

View file

@ -1,7 +1,7 @@
//! [Clap] struct used to parse command line arguments.
use crate::string_type::String as StringType;
use crate::utils::{clap_long_version, clap_version};
use crate::String as StringType;
use cfg_if::cfg_if;
use clap::{AppSettings, Clap};
use std::collections::BTreeSet;

11
src/string_type.rs Normal file
View file

@ -0,0 +1,11 @@
use cfg_if::cfg_if;
cfg_if! {
if #[cfg(not(all(target_endian = "big", target_pointer_width = "32")))] {
// most architectures
pub use smartstring::alias::String;
} else {
// powerpc and other big endian 32-bit archs
pub use std::string::String;
}
}

View file

@ -1,9 +1,9 @@
use fif::files::{mime_extension_lookup, BUF_SIZE};
use fif::files::{scan_directory, scan_from_walkdir};
use fif::findings::Findings;
use fif::formats::{Format, PowerShell, Shell};
use fif::mime_db::MimeDb;
use fif::String;
use crate::findings::Findings;
use crate::formats::{Format, PowerShell, Shell};
use crate::inspectors::{mime_extension_lookup, BUF_SIZE};
use crate::mime_db::MimeDb;
use crate::string_type::String;
use crate::{scan_directory, scan_from_walkdir};
use crate::parameters::Parameters;
use clap::Clap;
@ -21,12 +21,12 @@ const ZIP_BYTES: &[u8] = b"PK\x03\x04";
cfg_if::cfg_if! {
if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
fn get_mime_db() -> fif::mime_db::InferDb {
fif::mime_db::InferDb::init()
fn get_mime_db() -> crate::mime_db::InferDb {
crate::mime_db::InferDb::init()
}
} else {
fn get_mime_db() -> fif::mime_db::XdgDb {
fif::mime_db::XdgDb::init()
fn get_mime_db() -> crate::mime_db::XdgDb {
crate::mime_db::XdgDb::init()
}
}
}
@ -333,7 +333,10 @@ fn identify_random_bytes() {
for (mime, count) in &results {
println!("{}:\t{} counts", mime, count);
}
println!("No type found:\t{} counts", 1000 - results.values().sum::<i32>());
println!(
"No type found:\t{} counts",
results.values().len() as i32 - results.values().sum::<i32>()
);
}
#[test]
@ -429,8 +432,8 @@ fn media_contains_audio_video_images() {
#[test]
/// Ensure that the `writables!` macro produces the output it should.
fn writables_is_correct() {
use fif::formats::Writable;
use fif::writables;
use crate::formats::Writable;
use crate::writables;
assert_eq!(
&["henlo".into(), Path::new("henlo").into(), Writable::Newline,],
@ -462,16 +465,3 @@ fn verbosity() {
assert_eq!(Parameters::parse_from(&["fif", flags]).default_verbosity(), level);
}
}
#[test]
/// Ensures that smart strings don't deviate from std's Strings
fn validate_string_type() {
use fif::String as SmartString;
use std::string::String as StdString;
assert_eq!(SmartString::new(), StdString::new());
assert_eq!(SmartString::from("smol"), StdString::from("smol"));
assert_eq!(
SmartString::from("A long and therefore heap-allocated string"),
StdString::from("A long and therefore heap-allocated string")
);
}