Compare commits
No commits in common. "88b5070a03e9cb1673e52817a9793831f16bd4d0" and "6e2e788a614eb1e15abe414035b49f14a7fc1982" have entirely different histories.
88b5070a03...6e2e788a61
14 changed files with 435 additions and 492 deletions
@@ -2,11 +2,6 @@
 Dates are given in YYYY-MM-DD format.
 
 ## v0.3
 
-### v0.3.7 (2021-MM-DD)
-#### Other
-- Refactoring - split fif into main.rs and lib.rs, moved file-related functionality (directory scanning, etc.) into
-  files module, removed string module, etc.
-
 ### v0.3.6 (2021-08-16)
 #### Other
 - Fixed another major dependency issue - [`clap`] version 3 beta 2 pulls in `clap_derive` version 3 beta **4**, causing
Cargo.lock (generated): 36 lines changed

@@ -286,9 +286,9 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "0.4.8"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
 
 [[package]]
 name = "lazy_static"

@@ -311,9 +311,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.101"
+version = "0.2.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21"
+checksum = "a7f823d141fe0a24df1e23b4af4e3c7ba9e5966ec514ea068c93024aa7deb765"
 
 [[package]]
 name = "log"

@@ -461,9 +461,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.29"
+version = "1.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d"
+checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612"
 dependencies = [
  "unicode-xid",
 ]

@@ -593,18 +593,18 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
 
 [[package]]
 name = "serde"
-version = "1.0.130"
+version = "1.0.127"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
+checksum = "f03b9878abf6d14e6779d3f24f07b2cfa90352cfec4acc5aab8f1ac7f146fae8"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.130"
+version = "1.0.127"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b"
+checksum = "a024926d3432516606328597e0f224a51355a493b49fdd67e9209187cbe55ecc"
 dependencies = [
  "proc-macro2",
  "quote",

@@ -613,9 +613,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.67"
+version = "1.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950"
+checksum = "336b10da19a12ad094b59d870ebde26a45402e5b470add4b5fd03c5048a32127"
 dependencies = [
  "itoa",
  "ryu",

@@ -655,9 +655,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
 [[package]]
 name = "syn"
-version = "1.0.75"
+version = "1.0.74"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7f58f7e8eaa0009c5fec437aabf511bd9933e4b2d7407bd05273c01a8906ea7"
+checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c"
 dependencies = [
  "proc-macro2",
  "quote",

@@ -709,18 +709,18 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.28"
+version = "1.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "283d5230e63df9608ac7d9691adc1dfb6e701225436eb64d0b9a7f0a5a04f6ec"
+checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.28"
+version = "1.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa3884228611f5cd3608e2d409bf7dce832e4eb3135e3f11addbd7e41bd68e71"
+checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -25,7 +25,7 @@ xdg-mime-backend = ["xdg-mime"]
 json = ["serde", "serde_json"]
 
 [dependencies]
-walkdir = "~2.3.2"
+walkdir = "2.3.2"
 log = "0.4.14"
 mime = "0.3.16"
 mime_guess = { package = "new_mime_guess", features = ["phf-map"], version = "3.0.0" }
README.md: 31 lines changed

@@ -8,7 +8,7 @@
 [![Version](https://img.shields.io/crates/v/fif.svg?logo=rust&style=flat-square)
 ](https://crates.io/crates/fif)
 [![Minimum Supported Rust Version](https://img.shields.io/badge/msrv-1.43.0-orange?logo=rust&style=flat-square)
-](https://gitlab.com/Lynnesbian/fif/-/blob/master/README.md#version-policy)
+](https://crates.io/crates/fif)
 [![License](https://img.shields.io/crates/l/fif.svg?style=flat-square)
 ](https://gitlab.com/Lynnesbian/fif/-/blob/master/LICENSE)
 [![Build status](https://img.shields.io/gitlab/pipeline/Lynnesbian/fif/master?logo=gitlab&style=flat-square)

@@ -187,32 +187,3 @@ a more concise overview).
 [`xdg-mime`]: https://crates.io/crates/xdg-mime
 [`infer`]: https://crates.io/crates/infer
 [Shared MIME Info]: https://gitlab.freedesktop.org/xdg/shared-mime-info/
-
-
-## Version policy
-fif adheres to the [semantic versioning](https://semver.org/) principles. While fif remains at version 0.x, the version
-number will be updated as follows:
-- The MAJOR version will be bumped to 1 when I believe fif to be "feature complete".
-- The MINOR version will be bumped whenever I add a fairly important feature to fif (in the past, this has been bumped
-  when adding the ability to exclude extensions, and when fif gained the ability to output a bash script rather than a
-  list of invalid filenames). The MINOR version will also be bumped when increasing the MSRV.
-- The PATCH version will be bumped in all other cases, including minor feature additions (in the past, this has occurred
-  when adding features such as more output formats and the ignore flag).
-
-If/when fif hits version 1.0, these rules will likely remain the same as they are now.
-
-## License
-Copyright (C) 2021 Lynnesbian
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
clippy.sh: 10 lines changed

@@ -34,9 +34,6 @@ for backend in "${_backends[@]}"; do
     -A clippy::multiple-crate-versions \
     -A clippy::cast-possible-truncation \
     -A clippy::cast-possible-wrap \
-    -A clippy::must_use_candidate \
-    -A clippy::missing_panics_doc \
-    -A clippy::missing_errors_doc \
     "$_extra"
 done
 

@@ -46,8 +43,5 @@ done
 # shadow_unrelated: sometimes things that seem unrelated are actually related ;)
 # option_if_let_else: the suggested code is usually harder to read than the original
 # multiple_crate_versions: cached uses an old version of hashbrown :c
-# cast_possible_truncation: only ever used where it would be totally fine
-# cast_possible_wrap: ditto
-# must_use_candidate: useless
-# missing_panics_doc: the docs are just for me, fif isn't really intended to be used as a library, so this is unneeded
-# missing_errors_doc: ditto
+# cast-possible-truncation: only ever used where it would be totally fine
+# cast-possible-wrap: ditto
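As a reading aid for the `must_use_candidate` allowance toggled above, the following is a small hypothetical illustration in Rust (not taken from the fif codebase) of the pattern that lint flags.

use_example.rs (illustrative only):

// Hypothetical example: clippy::must_use_candidate suggests adding #[must_use]
// to public functions whose only effect is their return value, because calling
// such a function and discarding the result is almost certainly a mistake.
pub struct Tally {
    valid: usize,
    invalid: usize,
}

impl Tally {
    // clippy's suggestion here would be to put `#[must_use]` above this function.
    pub fn total(&self) -> usize {
        self.valid + self.invalid
    }
}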
src/files.rs: 327 lines changed (file deleted)

@@ -1,327 +0,0 @@
-use crate::findings::{Findings, ScanError};
-use crate::mime_db::MimeDb;
-use crate::parameters::ScanOpts;
-use crate::{String, MIMEDB};
-
-use std::collections::BTreeSet;
-use std::fs::File;
-use std::io;
-use std::io::{Read, Seek, SeekFrom};
-use std::path::Path;
-use std::str::FromStr;
-
-use cached::cached;
-use cfg_if::cfg_if;
-use log::{debug, error};
-use mime::Mime;
-use mime_guess::from_ext;
-use walkdir::{DirEntry, WalkDir};
-
-cfg_if! {
-  if #[cfg(windows)] {
-    /// Determines whether or not a file is hidden by checking its win32 file attributes.
-    pub fn is_hidden(entry: &DirEntry) -> bool {
-      use std::os::windows::prelude::*;
-      std::fs::metadata(entry.path()) // try to get metadata for file
-        .map_or(
-          false, // if getting metadata/attributes fails, assume it's not hidden
-          |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
-        )
-    }
-  } else {
-    /// Determines whether or not a file is hidden by checking for a leading full stop.
-    pub fn is_hidden(entry: &DirEntry) -> bool {
-      entry
-        .file_name()
-        .to_str()
-        .map_or(false, |f| f.starts_with('.') && f != ".")
-    }
-  }
-}
-
-/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
-/// `exts` (if specified), potentially skipping over hidden files, and so on.
-pub fn wanted_file(
-  entry: &DirEntry,
-  exts: Option<&BTreeSet<&str>>,
-  exclude: Option<&BTreeSet<&str>>,
-  scan_opts: &ScanOpts,
-) -> bool {
-  if entry.depth() == 0 {
-    // the root directory should always be scanned.
-    return true;
-  }
-
-  if !scan_opts.hidden && is_hidden(entry) {
-    // skip hidden files and directories. this check is performed first because it's very lightweight.
-    return false;
-  }
-
-  if entry.file_type().is_dir() {
-    // always allow directories - there's no point doing file extension matching on something that isn't a file.
-    return true;
-  }
-
-  if let Some(ext) = entry.path().extension() {
-    // file has extension - discard invalid UTF-8 and normalise it to lowercase.
-    let ext = ext.to_string_lossy().to_lowercase();
-    let ext = ext.as_str();
-
-    if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() {
-      // unknown extension, skip.
-      return false;
-    }
-
-    if let Some(exts) = exts {
-      // only scan if the file has one of the specified extensions.
-      exts.contains(&ext)
-    } else {
-      // no extensions specified - the file should be scanned unless its extension is on the exclude list.
-      exclude.map_or(true, |exclude| !exclude.contains(&ext))
-    }
-  } else {
-    // no file extension
-    scan_opts.extensionless
-  }
-}
-
-/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
-///
-/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a
-/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be
-/// determined.
-pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
-  let path = entry.path();
-  // try to determine mimetype for this entry
-  let result = match mime_type(MIMEDB.get().unwrap(), path) {
-    // an error occurred while trying to read the file
-    Err(_) => return Err(ScanError::File(path)),
-    // the file was read successfully, but we were unable to determine its mimetype
-    Ok(None) => return Err(ScanError::Mime(path)),
-    // a mimetype was found!
-    Ok(Some(result)) => result,
-  };
-
-  // set of known extensions for the given mimetype
-  let known_exts = mime_extension_lookup(result.essence_str().into());
-  // file extension for this particular file
-  let entry_ext = path.extension();
-
-  let valid = match known_exts {
-    // there is a known set of extensions for this mimetype, and the file has an extension
-    Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()),
-    // either this file has no extension, or there is no known set of extensions for this mimetype :(
-    Some(_) | None => false,
-  };
-
-  let path = if canonical_paths {
-    match std::fs::canonicalize(path) {
-      Ok(path) => path,
-      Err(_) => return Err(ScanError::File(entry.path())),
-    }
-  } else {
-    path.to_path_buf() // :c
-  };
-
-  Ok(Findings {
-    file: path,
-    valid,
-    mime: result,
-  })
-}
-
-/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
-pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
-  cfg_if! {
-    if #[cfg(feature = "multi-threaded")] {
-      use rayon::prelude::*;
-
-      // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
-      entries
-        .par_chunks(32)
-        .flat_map(|chunk| {
-          chunk
-            .iter() // iter over the chunk, which is a slice of DirEntry structs
-            .map(|entry| scan_file(entry, canonical_paths))
-            .collect::<Vec<_>>()
-        })
-        .collect()
-    } else {
-      entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
-    }
-  }
-}
-
-/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
-/// [DirEntry]s.
-pub fn scan_directory(
-  dirs: &Path,
-  exts: Option<&BTreeSet<&str>>,
-  exclude: Option<&BTreeSet<&str>>,
-  scan_opts: &ScanOpts,
-) -> Option<Vec<DirEntry>> {
-  let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
-  let mut probably_fatal_error = false;
-  let entries: Vec<DirEntry> = stepper
-    .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
-    .filter_map(|e| {
-      if let Err(err) = &e {
-        debug!("uh oh spaghettio!! {:#?}", e);
-        // log errors to stdout, and remove them from the iterator
-        let path = err.path().map_or("General error".into(), Path::to_string_lossy);
-
-        if err.depth() == 0 {
-          // if something goes wrong while trying to read the root directory, we're probably not going to get much done
-          probably_fatal_error = true;
-        }
-
-        // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
-        // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
-        // implements ToString (which they both do). map_or doesn't work on trait objects though :(
-        error!(
-          "{}: {}",
-          path,
-          err.io_error().map_or(err.to_string(), |e| e.to_string())
-        );
-        return None;
-      }
-      e.ok()
-    })
-    // remove directories from the final list
-    .filter(|e| !e.file_type().is_dir())
-    // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
-    // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
-    // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
-    // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
-    // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
-    // https://i.imgur.com/DYG7jlB.png
-    // adding the symlink filter removes the line that's being pointed to in the image. 0u0
-    .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
-    .collect();
-
-  if probably_fatal_error {
-    None
-  } else {
-    Some(entries)
-  }
-}
-
-/// The number of bytes to read initially.
-///
-/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
-/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
-/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
-pub const INITIAL_BUF_SIZE: usize = 128;
-
-/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
-pub const BUF_SIZE: usize = 8192;
-
-/// Tries to identify the mimetype of a file from a given path.
-pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
-  let mut buffer = [0; INITIAL_BUF_SIZE];
-  let mut file = File::open(path)?;
-
-  // read a small amount to start with
-  file.read(&mut buffer)?;
-
-  let r = db.get_type(&buffer).filter(|mime|
-    // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
-    // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
-    mime != &mime::TEXT_XML
-    // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
-    // determining that a file is in one of the MS office formats in particular requires looking quite far into the
-    // file.
-    && mime != &Mime::from_str("application/zip").unwrap()
-    // doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
-    // shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
-    // will allow it to be detected correctly as the appropriate filetype.
-    && mime != &Mime::from_str("application/x-ole-storage").unwrap());
-
-  if r.is_some() {
-    return Ok(r);
-  }
-
-  // attempt to read up to the BUF_SIZE bytes of the file.
-  // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
-  // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
-  // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
-  // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
-  // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
-  // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
-  // idiomatic/safe and fast.
-  let mut buffer = [0; BUF_SIZE];
-  file.seek(SeekFrom::Start(0))?;
-  file.read(&mut buffer)?;
-  Ok(db.get_type(&buffer))
-}
-
-cached! {
-  MIMEXT;
-  fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
-    // Returns a list of known extensions for this mime type, if any.
-    // This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
-    // the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
-    // essence_str (which includes the suffix) fixes this.
-    // ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the
-    // proc_macro version of cached, but it has a huge number of deps :c
-
-    let essence = essence.as_str();
-    let mut exts = mime_guess::get_mime_extensions_str(essence);
-    if exts.is_none() {
-      // no matches :c
-      // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
-      // but mime_guess only understands "some/thing", or vice-versa.
-      // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
-      // "some/x-thing".
-      if essence.contains("/x-") {
-        // replace e.g. "application/x-gzip" with "application/gzip"
-        exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
-      } else {
-        // replace e.g. "video/mp2t" with "video/x-mp2t"
-        exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
-      }
-    }
-
-    match exts {
-      Some(exts) => {
-        let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
-
-        Some(if essence == mime::IMAGE_JPEG.essence_str() {
-          // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
-          // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
-          // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
-          [vec![String::from("jpg")], possible_exts].concat()
-
-        } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
-          // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
-          // (in my opinion) be "xml".
-          // there's also another problem: SVG files can easily be misidentified as XML files, because they usually
-          // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
-          // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
-          // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
-          // to have valid extensions.
-          // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
-          // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
-          // extension is classed as application/*+xml, consider it OK
-          [vec![String::from("xml"), String::from("svg")], possible_exts].concat()
-
-        } else if essence == "application/msword" {
-          // classic office files considered harmful
-          vec![String::from("doc"), String::from("xls"), String::from("ppt")]
-
-        } else if essence == "application/zip" {
-          // neither xdg-mime nor infer seem to be able to detect office XML files properly...
-          [vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat()
-
-        } else if essence == "application/x-ms-dos-executable" {
-          // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
-          // other!
-          [vec![String::from("dll"), String::from("exe")], possible_exts].concat()
-        } else {
-          possible_exts
-        })
-      },
-      None => None
-    }
-  }
-}
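A minimal usage sketch of the `mime_extension_lookup` special-casing shown above, written against the old `fif::files` API that this hunk deletes (the same import the old test suite used). The exact contents of the returned list depend on mime_guess's database, so the assertions are illustrative rather than guaranteed.

use fif::files::mime_extension_lookup;
use fif::String;

fn demo() {
    // "jpg" is prepended ahead of mime_guess's alphabetical "jpe" suggestion.
    let jpeg = mime_extension_lookup(String::from("image/jpeg")).unwrap_or_default();
    assert_eq!(jpeg[0], String::from("jpg"));

    // "svg" is accepted for XML so that SVGs misidentified as XML keep a "valid" extension.
    let xml = mime_extension_lookup(String::from("text/xml")).unwrap_or_default();
    assert!(xml.contains(&String::from("svg")));
}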
@@ -2,8 +2,8 @@ use std::path::{Path, PathBuf};
 
 use mime::Mime;
 
-use crate::files::mime_extension_lookup;
-use crate::String;
+use crate::inspectors::mime_extension_lookup;
+use crate::string_type::String;
 
 #[cfg(feature = "json")]
 use serde::{ser::SerializeStruct, Serializer};
@@ -12,21 +12,17 @@ use snailquote::escape;
 use crate::findings::ScanError;
 use crate::utils::clap_long_version;
 use crate::Findings;
-use itertools::{Either, Itertools};
+use itertools::Itertools;
 
 /// A macro for creating an array of `Writable`s without needing to pepper your code with `into()`s.
 /// # Usage
 /// ```
-/// use crate::fif::writables;
-/// use crate::fif::formats::{Writable, smart_write};
-/// let mut f = std::io::stdout();
-///
+/// let f = std::io::stdout();
 /// // Instead of...
-/// smart_write(&mut f, &["hello".into(), Writable::Newline]);
+/// smart_write(f, &["hello".into(), Writable::Newline]);
 /// // ...just use:
-/// smart_write(&mut f, writables!["hello", Newline]);
+/// smart_write(f, writables!["hello", Newline]);
 /// ```
 
 #[macro_export]
 macro_rules! writables {
   [$($args:tt),+] => {

@@ -75,7 +71,7 @@ impl<'a> From<&'a OsStr> for Writable<'a> {
 
 fn generated_by() -> String { format!("Generated by fif {}", clap_long_version()) }
 
-pub fn smart_write<W: Write>(f: &mut W, writeables: &[Writable]) -> io::Result<()> {
+fn smart_write<W: Write>(f: &mut W, writeables: &[Writable]) -> io::Result<()> {
   // ehhhh
   for writeable in writeables {
     match writeable {

@@ -140,12 +136,12 @@ pub trait FormatSteps {
 
   // sort errors so unreadable files appear before files with unknown mimetypes - ScanError impls Ord such that
   // ScanError::File > ScanError::Mime
-  let errors = entries.iter().filter_map(|e| e.as_ref().err()).sorted_unstable();
+  let errors = entries.iter().filter_map(|e| e.as_ref().err()).sorted();
   // sort files so that files with no known extension come before those with known extensions - None > Some("jpg")
   let findings = entries
     .iter()
     .filter_map(|e| e.as_ref().ok())
-    .sorted_unstable_by(|a, b| b.recommended_extension().cmp(&a.recommended_extension()).reverse());
+    .sorted_by(|a, b| b.recommended_extension().cmp(&a.recommended_extension()).reverse());
 
   for error in errors {
     match error {

@@ -344,12 +340,13 @@ impl Format for Json {
       findings: &'a Vec<&'a Findings>,
     }
 
-    let (errors, findings) = &entries.iter().partition_map(|entry| match entry {
-      Err(e) => Either::Left(e),
-      Ok(f) => Either::Right(f),
-    });
-
-    let result = serde_json::to_writer_pretty(f, &SerdeEntries { errors, findings });
+    let result = serde_json::to_writer_pretty(
+      f,
+      &SerdeEntries {
+        errors: &entries.iter().filter_map(|e| e.as_ref().err()).sorted().collect(),
+        findings: &entries.iter().filter_map(|f| f.as_ref().ok()).sorted().collect(),
+      },
+    );
 
     if let Err(err) = result {
       log::error!("Error while serialising: {}", err);
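For readers unfamiliar with the `partition_map`/`Either` combination removed from the Json formatter above, here is a small self-contained sketch (illustrative only, not fif code) of what it does: it routes each item to one of two collections in a single pass over the iterator.

use itertools::{Either, Itertools};

fn demo() {
    let entries: Vec<Result<u32, &str>> = vec![Ok(1), Err("unreadable"), Ok(3)];
    // Each item is mapped to Either::Left or Either::Right; Lefts and Rights are
    // collected into separate containers in one traversal.
    let (errors, findings): (Vec<&str>, Vec<u32>) = entries.into_iter().partition_map(|entry| match entry {
        Err(e) => Either::Left(e),
        Ok(f) => Either::Right(f),
    });
    assert_eq!(errors, vec!["unreadable"]);
    assert_eq!(findings, vec![1, 3]);
}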
src/inspectors.rs (new file): 133 lines changed

@@ -0,0 +1,133 @@
+//! Functions for getting the mime type and extension of a file.
+
+use std::fs::File;
+use std::io;
+use std::io::{Read, Seek, SeekFrom};
+use std::path::Path;
+use std::str::FromStr;
+
+use cached::cached;
+use mime::Mime;
+
+use crate::mime_db::MimeDb;
+use crate::string_type::String;
+
+/// The number of bytes to read initially.
+///
+/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
+/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
+/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
+pub const INITIAL_BUF_SIZE: usize = 128;
+
+/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
+pub const BUF_SIZE: usize = 8192;
+
+/// Tries to identify the mimetype of a file from a given path.
+pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
+  let mut buffer = [0; INITIAL_BUF_SIZE];
+  let mut file = File::open(path)?;
+
+  // read a small amount to start with
+  file.read(&mut buffer)?;
+
+  let r = db.get_type(&buffer).filter(|mime|
+    // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
+    // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
+    mime != &mime::TEXT_XML
+    // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
+    // determining that a file is in one of the MS office formats in particular requires looking quite far into the
+    // file.
+    && mime != &Mime::from_str("application/zip").unwrap()
+    // doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
+    // shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
+    // will allow it to be detected correctly as the appropriate filetype.
+    && mime != &Mime::from_str("application/x-ole-storage").unwrap());
+
+  if r.is_some() {
+    return Ok(r);
+  }
+
+  // attempt to read up to the BUF_SIZE bytes of the file.
+  // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
+  // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
+  // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
+  // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
+  // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
+  // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
+  // idiomatic/safe and fast.
+  let mut buffer = [0; BUF_SIZE];
+  file.seek(SeekFrom::Start(0))?;
+  file.read(&mut buffer)?;
+  Ok(db.get_type(&buffer))
+}
+
+cached! {
+  MIMEXT;
+  fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
+    // Returns a list of known extensions for this mime type, if any.
+    // This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
+    // the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
+    // essence_str (which includes the suffix) fixes this.
+    // ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the
+    // proc_macro version of cached, but it has a huge number of deps :c
+
+    let essence = essence.as_str();
+    let mut exts = mime_guess::get_mime_extensions_str(essence);
+    if exts.is_none() {
+      // no matches :c
+      // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
+      // but mime_guess only understands "some/thing", or vice-versa.
+      // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
+      // "some/x-thing".
+      if essence.contains("/x-") {
+        // replace e.g. "application/x-gzip" with "application/gzip"
+        exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
+      } else {
+        // replace e.g. "video/mp2t" with "video/x-mp2t"
+        exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
+      }
+    }
+
+    match exts {
+      Some(exts) => {
+        let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
+
+        Some(if essence == mime::IMAGE_JPEG.essence_str() {
+          // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
+          // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
+          // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
+          [vec![String::from("jpg")], possible_exts].concat()
+
+        } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
+          // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
+          // (in my opinion) be "xml".
+          // there's also another problem: SVG files can easily be misidentified as XML files, because they usually
+          // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
+          // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
+          // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
+          // to have valid extensions.
+          // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
+          // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
+          // extension is classed as application/*+xml, consider it OK
+          [vec![String::from("xml"), String::from("svg")], possible_exts].concat()
+
+        } else if essence == "application/msword" {
+          // classic office files considered harmful
+          vec![String::from("doc"), String::from("xls"), String::from("ppt")]
+
+        } else if essence == "application/zip" {
+          // neither xdg-mime nor infer seem to be able to detect office XML files properly...
+          [vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat()
+
+        } else if essence == "application/x-ms-dos-executable" {
+          // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
+          // other!
+          [vec![String::from("dll"), String::from("exe")], possible_exts].concat()
+        } else {
+          possible_exts
+        })
+      },
+      None => None
+    }
+  }
+}
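The comment in `mime_type` above wonders whether the bytes from the first read could be reused instead of seeking back to the start of the file. The following is one possible shape of that idea, offered as a hedged sketch rather than a drop-in replacement: it assumes the caller tracked how many bytes the initial read() actually returned, which the code above currently discards, and it makes no claim about being faster.

use std::fs::File;
use std::io::{self, Read};

const INITIAL_BUF_SIZE: usize = 128;
const BUF_SIZE: usize = 8192;

// Sketch only: keep the bytes already read and fetch just the remainder, instead of
// seeking to 0 and re-reading BUF_SIZE bytes. `first_read` is the byte count returned
// by the initial read(); without it, short files would leave a gap of zeroes.
fn refill(file: &mut File, initial: &[u8; INITIAL_BUF_SIZE], first_read: usize) -> io::Result<Vec<u8>> {
    let mut buf = vec![0u8; BUF_SIZE];
    buf[..first_read].copy_from_slice(&initial[..first_read]);
    // read() continues from the current cursor position, so no seek is needed.
    let more = file.read(&mut buf[first_read..])?;
    buf.truncate(first_read + more);
    Ok(buf)
}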
src/lib.rs: 52 lines changed (file deleted)

@@ -1,52 +0,0 @@
-#![forbid(unsafe_code)]
-#![warn(trivial_casts, unused_lifetimes, unused_qualifications)]
-
-pub mod files;
-pub mod findings;
-pub mod formats;
-pub mod mime_db;
-pub mod parameters;
-pub mod utils;
-
-use crate::findings::Findings;
-use crate::mime_db::MimeDb;
-
-use cfg_if::cfg_if;
-use once_cell::sync::OnceCell;
-
-cfg_if! {
-  if #[cfg(not(all(target_endian = "big", target_pointer_width = "32")))] {
-    // most architectures
-    pub use smartstring::alias::String;
-  } else {
-    // powerpc and other big endian 32-bit archs
-    pub use std::string::String;
-  }
-}
-
-cfg_if! {
-  if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
-    /// A [OnceCell] holding an instance of [mime_db::MimeDb].
-    pub static MIMEDB: OnceCell<mime_db::InferDb> = OnceCell::new();
-  } else {
-    /// A [OnceCell] holding an instance of [mime_db::MimeDb].
-    pub static MIMEDB: OnceCell<mime_db::XdgDb> = OnceCell::new();
-  }
-}
-
-/// Initialises [`MIMEDB`] with a value dependent on the current backend.
-pub fn init_db() {
-  cfg_if! {
-    if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
-      MIMEDB
-        .set(crate::mime_db::InferDb::init())
-        .or(Err("Failed to initialise Infer backend!"))
-        .unwrap();
-    } else {
-      MIMEDB
-        .set(crate::mime_db::XdgDb::init())
-        .or(Err("Failed to initialise XDG Mime backend!"))
-        .unwrap();
-    }
-  }
-}
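As a reading aid for the removed facade, this is a rough sketch of how the old binary drove the library, mirroring the `use fif::...` imports visible on the removed side of the src/main.rs hunk below. It assumes a `ScanOpts` value is constructed elsewhere and that the items re-exported by the deleted lib.rs are public, as the old test suite implies.

use std::collections::BTreeSet;
use std::path::Path;

use fif::files::{scan_directory, scan_from_walkdir};
use fif::init_db;
use fif::parameters::ScanOpts;

fn run(dir: &Path, scan_opts: &ScanOpts) {
    // The global MIMEDB OnceCell must be initialised before any file is scanned.
    init_db();

    let exts: Option<&BTreeSet<&str>> = None;    // scan every extension
    let exclude: Option<&BTreeSet<&str>> = None; // exclude nothing

    if let Some(entries) = scan_directory(dir, exts, exclude, scan_opts) {
        let results = scan_from_walkdir(&entries, false);
        let ok = results.iter().filter(|r| r.is_ok()).count();
        println!("{} of {} files scanned successfully", ok, results.len());
    }
}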
src/main.rs: 241 lines changed

@@ -18,19 +18,44 @@
 #![warn(trivial_casts, unused_lifetimes, unused_qualifications)]
 
 use std::io::{stdout, BufWriter, Write};
+use std::path::Path;
 use std::process::exit;
 
+use cfg_if::cfg_if;
 use clap::Clap;
 use log::{debug, error, info, trace, warn, Level};
+use once_cell::sync::OnceCell;
+use walkdir::{DirEntry, WalkDir};
 
-use fif::files::{scan_directory, scan_from_walkdir};
-use fif::formats::Format;
-use fif::parameters::OutputFormat;
-use fif::utils::{clap_long_version, os_name};
-use fif::{formats, init_db, parameters};
+use crate::findings::Findings;
+use crate::findings::ScanError;
+use crate::formats::Format;
+use crate::mime_db::MimeDb;
+use crate::parameters::{OutputFormat, ScanOpts};
+use crate::utils::{clap_long_version, os_name};
+use mime_guess::from_ext;
+use std::collections::BTreeSet;
 
+mod findings;
+mod formats;
+mod inspectors;
+mod mime_db;
+mod parameters;
+pub(crate) mod string_type;
+
 #[cfg(test)]
 mod tests;
+mod utils;
+
+cfg_if! {
+  if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
+    /// A [OnceCell] holding an instance of [mime_db::MimeDb].
+    static MIMEDB: OnceCell<mime_db::InferDb> = OnceCell::new();
+  } else {
+    /// A [OnceCell] holding an instance of [mime_db::MimeDb].
+    static MIMEDB: OnceCell<mime_db::XdgDb> = OnceCell::new();
+  }
+}
 
 #[doc(hidden)]
 #[allow(clippy::cognitive_complexity)]

@@ -142,3 +167,209 @@ fn main() {
 
   debug!("Done");
 }
+
+cfg_if! {
+  if #[cfg(windows)] {
+    /// Determines whether or not a file is hidden by checking its win32 file attributes.
+    fn is_hidden(entry: &DirEntry) -> bool {
+      use std::os::windows::prelude::*;
+      std::fs::metadata(entry.path()) // try to get metadata for file
+        .map_or(
+          false, // if getting metadata/attributes fails, assume it's not hidden
+          |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
+        )
+    }
+  } else {
+    /// Determines whether or not a file is hidden by checking for a leading full stop.
+    fn is_hidden(entry: &DirEntry) -> bool {
+      entry
+        .file_name()
+        .to_str()
+        .map_or(false, |f| f.starts_with('.') && f != ".")
+    }
+  }
+}
+
+/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
+/// `exts` (if specified), potentially skipping over hidden files, and so on.
+fn wanted_file(
+  entry: &DirEntry,
+  exts: Option<&BTreeSet<&str>>,
+  exclude: Option<&BTreeSet<&str>>,
+  scan_opts: &ScanOpts,
+) -> bool {
+  if entry.depth() == 0 {
+    // the root directory should always be scanned.
+    return true;
+  }
+
+  if !scan_opts.hidden && is_hidden(entry) {
+    // skip hidden files and directories. this check is performed first because it's very lightweight.
+    return false;
+  }
+
+  if entry.file_type().is_dir() {
+    // always allow directories - there's no point doing file extension matching on something that isn't a file.
+    return true;
+  }
+
+  if let Some(ext) = entry.path().extension() {
+    // file has extension - discard invalid UTF-8 and normalise it to lowercase.
+    let ext = ext.to_string_lossy().to_lowercase();
+    let ext = ext.as_str();
+
+    if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() {
+      // unknown extension, skip.
+      return false;
+    }
+
+    if let Some(exts) = exts {
+      // only scan if the file has one of the specified extensions.
+      exts.contains(&ext)
+    } else {
+      // no extensions specified - the file should be scanned unless its extension is on the exclude list.
+      exclude.map_or(true, |exclude| !exclude.contains(&ext))
+    }
+  } else {
+    // no file extension
+    scan_opts.extensionless
+  }
+}
+
+/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
+///
+/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a
+/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be
+/// determined.
+fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
+  let path = entry.path();
+  // try to determine mimetype for this entry
+  let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) {
+    // an error occurred while trying to read the file
+    Err(_) => return Err(ScanError::File(path)),
+    // the file was read successfully, but we were unable to determine its mimetype
+    Ok(None) => return Err(ScanError::Mime(path)),
+    // a mimetype was found!
+    Ok(Some(result)) => result,
+  };
+
+  // set of known extensions for the given mimetype
+  let known_exts = inspectors::mime_extension_lookup(result.essence_str().into());
+  // file extension for this particular file
+  let entry_ext = path.extension();
+
+  let valid = match known_exts {
+    // there is a known set of extensions for this mimetype, and the file has an extension
+    Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()),
+    // either this file has no extension, or there is no known set of extensions for this mimetype :(
+    Some(_) | None => false,
+  };
+
+  let path = if canonical_paths {
+    match std::fs::canonicalize(path) {
+      Ok(path) => path,
+      Err(_) => return Err(ScanError::File(entry.path())),
+    }
+  } else {
+    path.to_path_buf() // :c
+  };
+
+  Ok(Findings {
+    file: path,
+    valid,
+    mime: result,
+  })
+}
+
+/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
+fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
+  cfg_if! {
+    if #[cfg(feature = "multi-threaded")] {
+      use rayon::prelude::*;
+
+      // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
+      entries
+        .par_chunks(32)
+        .flat_map(|chunk| {
+          chunk
+            .iter() // iter over the chunk, which is a slice of DirEntry structs
+            .map(|entry| scan_file(entry, canonical_paths))
+            .collect::<Vec<_>>()
+        })
+        .collect()
+    } else {
+      entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
+    }
+  }
+}
+
+/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
+/// [DirEntry]s.
+fn scan_directory(
+  dirs: &Path,
+  exts: Option<&BTreeSet<&str>>,
+  exclude: Option<&BTreeSet<&str>>,
+  scan_opts: &ScanOpts,
+) -> Option<Vec<DirEntry>> {
+  let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
+  let mut probably_fatal_error = false;
+  let entries: Vec<DirEntry> = stepper
+    .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
+    .filter_map(|e| {
+      if let Err(err) = &e {
+        debug!("uh oh spaghettio!! {:#?}", e);
+        // log errors to stdout, and remove them from the iterator
+        let path = err.path().map_or("General error".into(), Path::to_string_lossy);
+
+        if err.depth() == 0 {
+          // if something goes wrong while trying to read the root directory, we're probably not going to get much done
+          probably_fatal_error = true;
+        }
+
+        // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
+        // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
+        // implements ToString (which they both do). map_or doesn't work on trait objects though :(
+        error!(
+          "{}: {}",
+          path,
+          err.io_error().map_or(err.to_string(), |e| e.to_string())
+        );
+        return None;
+      }
+      e.ok()
+    })
+    // remove directories from the final list
+    .filter(|e| !e.file_type().is_dir())
+    // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
+    // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
+    // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
+    // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
+    // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
+    // https://i.imgur.com/DYG7jlB.png
+    // adding the symlink filter removes the line that's being pointed to in the image. 0u0
+    .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
+    .collect();
+
+  if probably_fatal_error {
+    None
+  } else {
+    Some(entries)
+  }
+}
+
+/// Initialises [`MIMEDB`] with a value dependent on the current backend.
+fn init_db() {
+  cfg_if! {
+    if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
+      MIMEDB
+        .set(mime_db::InferDb::init())
+        .or(Err("Failed to initialise Infer backend!"))
+        .unwrap();
+    } else {
+      MIMEDB
+        .set(mime_db::XdgDb::init())
+        .or(Err("Failed to initialise XDG Mime backend!"))
+        .unwrap();
+    }
+  }
+}
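The backend-selection predicate that recurs throughout this comparison, `#[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))]`, is easier to read as a truth table. The sketch below is illustrative only; it uses the same cfg expression as src/main.rs above and spells out which database each platform/feature combination selects.

use cfg_if::cfg_if;

// Same predicate as in src/main.rs, with the four cases written out:
// - unix,     infer-backend enabled      -> InferDb
// - unix,     infer-backend disabled     -> XdgDb   (the unix default)
// - non-unix, xdg-mime-backend enabled   -> XdgDb
// - non-unix, xdg-mime-backend disabled  -> InferDb (the non-unix default)
cfg_if! {
    if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
        type Backend = mime_db::InferDb;
    } else {
        type Backend = mime_db::XdgDb;
    }
}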
@@ -1,7 +1,7 @@
 //! [Clap] struct used to parse command line arguments.
 
+use crate::string_type::String as StringType;
 use crate::utils::{clap_long_version, clap_version};
-use crate::String as StringType;
 use cfg_if::cfg_if;
 use clap::{AppSettings, Clap};
 use std::collections::BTreeSet;
src/string_type.rs (new file): 11 lines changed

@@ -0,0 +1,11 @@
+use cfg_if::cfg_if;
+
+cfg_if! {
+  if #[cfg(not(all(target_endian = "big", target_pointer_width = "32")))] {
+    // most architectures
+    pub use smartstring::alias::String;
+  } else {
+    // powerpc and other big endian 32-bit archs
+    pub use std::string::String;
+  }
+}
@@ -1,9 +1,9 @@
-use fif::files::{mime_extension_lookup, BUF_SIZE};
-use fif::files::{scan_directory, scan_from_walkdir};
-use fif::findings::Findings;
-use fif::formats::{Format, PowerShell, Shell};
-use fif::mime_db::MimeDb;
-use fif::String;
+use crate::findings::Findings;
+use crate::formats::{Format, PowerShell, Shell};
+use crate::inspectors::{mime_extension_lookup, BUF_SIZE};
+use crate::mime_db::MimeDb;
+use crate::string_type::String;
+use crate::{scan_directory, scan_from_walkdir};
 
 use crate::parameters::Parameters;
 use clap::Clap;

@@ -21,12 +21,12 @@ const ZIP_BYTES: &[u8] = b"PK\x03\x04";
 
 cfg_if::cfg_if! {
   if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
-    fn get_mime_db() -> fif::mime_db::InferDb {
-      fif::mime_db::InferDb::init()
+    fn get_mime_db() -> crate::mime_db::InferDb {
+      crate::mime_db::InferDb::init()
     }
   } else {
-    fn get_mime_db() -> fif::mime_db::XdgDb {
-      fif::mime_db::XdgDb::init()
+    fn get_mime_db() -> crate::mime_db::XdgDb {
+      crate::mime_db::XdgDb::init()
     }
   }
 }

@@ -333,7 +333,10 @@ fn identify_random_bytes() {
   for (mime, count) in &results {
     println!("{}:\t{} counts", mime, count);
   }
-  println!("No type found:\t{} counts", 1000 - results.values().sum::<i32>());
+  println!(
+    "No type found:\t{} counts",
+    results.values().len() as i32 - results.values().sum::<i32>()
+  );
 }
 
 #[test]

@@ -429,8 +432,8 @@ fn media_contains_audio_video_images() {
 #[test]
 /// Ensure that the `writables!` macro produces the output it should.
 fn writables_is_correct() {
-  use fif::formats::Writable;
-  use fif::writables;
+  use crate::formats::Writable;
+  use crate::writables;
 
   assert_eq!(
     &["henlo".into(), Path::new("henlo").into(), Writable::Newline,],

@@ -462,16 +465,3 @@ fn verbosity() {
     assert_eq!(Parameters::parse_from(&["fif", flags]).default_verbosity(), level);
   }
 }
-
-#[test]
-/// Ensures that smart strings don't deviate from std's Strings
-fn validate_string_type() {
-  use fif::String as SmartString;
-  use std::string::String as StdString;
-  assert_eq!(SmartString::new(), StdString::new());
-  assert_eq!(SmartString::from("smol"), StdString::from("smol"));
-  assert_eq!(
-    SmartString::from("A long and therefore heap-allocated string"),
-    StdString::from("A long and therefore heap-allocated string")
-  );
-}