fif/src/main.rs

323 lines
11 KiB
Rust

// fif - a command-line tool for detecting and optionally correcting files with incorrect extensions.
// Copyright (C) 2021 Lynnesbian
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
use std::io::{stdout, BufWriter};
use std::path::{Path, PathBuf};
use cfg_if::cfg_if;
use clap::Clap;
use log::{debug, error, info, trace, warn};
use once_cell::sync::OnceCell;
#[cfg(feature = "multi-threaded")]
use rayon::prelude::*;
use smartstring::alias::String;
use walkdir::{DirEntry, WalkDir};
use crate::findings::Findings;
use crate::formats::{Format, Script};
use crate::mime_db::MimeDb;
use crate::parameters::{OutputFormat, ScanOpts};
use crate::scan_error::ScanError;
use env_logger::Env;
use std::process::exit;
mod extension_set;
mod findings;
mod formats;
mod inspectors;
mod mime_db;
mod parameters;
mod scan_error;
#[cfg(test)]
mod tests;
cfg_if! {
if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
/// A [OnceCell] holding the [mime_db::InferDb] instance used for mimetype lookups. It is populated by [init_db].
///
/// This variant is compiled on unix when the `infer-backend` feature is enabled, and on non-unix targets unless
/// the `xdg-mime-backend` feature is enabled.
static MIMEDB: OnceCell<mime_db::InferDb> = OnceCell::new();
/// The name of the backend selected at compile time; "Infer" in this configuration ("XDG-Mime" in the other).
const BACKEND: &str = "Infer";
} else {
/// A [OnceCell] holding the [mime_db::XdgDb] instance used for mimetype lookups. It is populated by [init_db].
///
/// This variant is compiled whenever the condition for the Infer backend (see the `cfg` above) does not hold.
static MIMEDB: OnceCell<mime_db::XdgDb> = OnceCell::new();
/// The name of the backend selected at compile time; "XDG-Mime" in this configuration ("Infer" in the other).
const BACKEND: &str = "XDG-Mime";
}
}
#[doc(hidden)]
// Program entry point. Parses the command-line arguments, configures logging, initialises the mime database,
// collects the files to inspect, scans them, and hands every result that is either an error or an invalid
// extension to the selected output format.
//
// Exit codes: `exitcode::NOINPUT` when the directory scan reported a fatal error, `exitcode::IOERR` when the
// output could not be written to stdout, and `exitcode::OK` when there is nothing to scan or nothing to report.
fn main() {
    let args: parameters::Parameters = parameters::Parameters::parse();

    let mut builder = env_logger::Builder::from_env(Env::new().filter_or("RUST_LOG", "INFO"));
    builder
        .format_module_path(false) // don't include the module in logs, as it's not necessary
        .format_timestamp(None) // don't include timestamps, as they're not necessary either
        .init();

    init_db();

    debug!("Iterating directory: {:?}", args.dirs);
    let extensions = args.extensions();
    debug!("Checking files with extensions: {:?}", extensions);

    let entries = match scan_directory(&args.dirs, &extensions, &args.get_scan_opts()) {
        Some(entries) => entries,
        // no need to log anything here - scan_directory has already logged the underlying error (something like
        // "[ERROR] /fake/path: No such file or directory (os error 2)"), so we can assume that the dir given as
        // input doesn't exist or is otherwise unreadable.
        None => exit(exitcode::NOINPUT),
    };

    if entries.is_empty() {
        warn!("No files matching requested options found.");
        exit(exitcode::OK);
    }

    trace!("Found {} items to check", entries.len());

    // keep only the results worth reporting: scan errors, and files whose extension is not valid.
    // TODO: find a way to trace! the valid files as they are filtered out.
    let results: Vec<_> = scan_from_walkdir(&entries)
        .into_iter()
        .filter(|result| !matches!(result, Ok(findings) if findings.valid))
        .collect();

    for result in &results {
        match result {
            Ok(r) => debug!(
                "{:?} should have file extension {}",
                r.file,
                r.recommended_extension().unwrap_or_else(|| "???".into())
            ),
            Err(f) => warn!("Error 0uo - {}", f),
        }
    }

    if results.is_empty() {
        info!("All files have valid extensions!");
        exit(exitcode::OK);
    }

    match args.output_format {
        OutputFormat::Script => {
            let s = Script::new();
            let out = stdout();
            let mut writer = BufWriter::new(out.lock());
            let written = s.write_all(&results, &mut writer);
            // flush explicitly: a BufWriter that is merely dropped discards any error raised while flushing, which
            // would let a truncated script go unnoticed.
            if written.is_err() || std::io::Write::flush(&mut writer).is_err() {
                error!("Failed to write to stdout.");
                exit(exitcode::IOERR);
            }
        }
        OutputFormat::Text => todo!(),
    }

    debug!("Done");
}
cfg_if! {
    if #[cfg(windows)] {
        /// Reports whether `entry` is hidden, judged by the "hidden" bit of its win32 file attributes.
        ///
        /// If the entry's metadata cannot be read, it is treated as not hidden.
        fn is_hidden(entry: &DirEntry) -> bool {
            use std::os::windows::prelude::*;
            match std::fs::metadata(entry.path()) {
                // 0x2 is the flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
                Ok(meta) => meta.file_attributes() & 0x2 != 0,
                Err(_) => false,
            }
        }
    } else {
        /// Reports whether `entry` is hidden, judged by a leading full stop in its file name.
        ///
        /// The name `.` on its own is not considered hidden, and neither is a name that is not valid UTF-8.
        fn is_hidden(entry: &DirEntry) -> bool {
            match entry.file_name().to_str() {
                Some(name) => name != "." && name.starts_with('.'),
                None => false,
            }
        }
    }
}
/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
/// `exts`, potentially skipping over hidden files, and so on.
///
/// The checks are applied in this order:
/// 1. the root of the walk (depth 0) is always accepted;
/// 2. hidden entries are rejected unless `scan_opts.hidden` is set;
/// 3. directories are always accepted, so that the walk can descend into them;
/// 4. files without an extension are accepted only if `scan_opts.extensionless` is set;
/// 5. any other file is accepted if its lowercased extension appears in `exts`.
fn wanted_file(entry: &DirEntry, exts: &[&str], scan_opts: &ScanOpts) -> bool {
    if entry.depth() == 0 {
        // the root directory should always be scanned.
        return true;
    }

    if !scan_opts.hidden && is_hidden(entry) {
        // skip hidden files and directories. this check is performed first because it's very lightweight.
        return false;
    }

    if entry.file_type().is_dir() {
        // always allow directories - there's no point doing file extension matching on something that isn't a file.
        return true;
    }

    match extension_from_path(entry.path()) {
        // the comparison is case-insensitive on the file's side: `exts` is compared against the lowercased extension.
        Some(ext) => exts.contains(&ext.to_lowercase().as_str()),
        // no extension at all: there is nothing to look up in `exts`, so the decision rests entirely on whether
        // extensionless files were requested. (previously this case panicked when `extensionless` was enabled.)
        None => scan_opts.extensionless,
    }
}
/// Extracts the extension of `path`, as reported by [std::path::Path::extension].
///
/// Returns `None` when the path has no extension. Otherwise, the extension is
/// [converted to a lossy string](std::ffi::OsStr::to_string_lossy) before being returned.
// TODO: return an OsStr rather than a lossily-converted string.
fn extension_from_path(path: &Path) -> Option<String> {
    // bail out early if there's no extension to convert
    let raw = path.extension()?;
    Some(String::from(raw.to_string_lossy()))
}
/// Examines a single entry and reports whether its extension agrees with its detected mimetype.
///
/// On success, the returned [Findings] carries the entry's path, the detected mimetype, and a `valid` flag. The flag
/// is `true` only when a set of extensions is known for the mimetype, the file has an extension, and the lowercased
/// extension is contained in that set.
///
/// If the mimetype lookup itself fails, a [ScanError::File] is returned. If the lookup succeeds but yields no
/// mimetype, a [ScanError::Mime] is returned. Both carry the entry's path.
///
/// Panics if [MIMEDB] has not been initialised.
fn scan_file(entry: &DirEntry) -> Result<Findings, ScanError> {
    let path = entry.path();

    let mime = match inspectors::mime_type(MIMEDB.get().unwrap(), path) {
        // something went wrong while trying to inspect the file
        Err(_) => return Err(ScanError::File(path.to_path_buf())),
        // the file was inspected successfully, but no mimetype could be determined for it
        Ok(None) => return Err(ScanError::Mime(path.to_path_buf())),
        Ok(Some(mime)) => mime,
    };

    // the extensions known for this mimetype (if any), and the extension this file actually has (if any)
    let known_exts = inspectors::mime_extension_lookup(mime.clone());
    let entry_ext = extension_from_path(path);

    let valid = match (known_exts, entry_ext) {
        (Some(set), Some(ext)) => set.contains(&ext.to_lowercase().into()),
        // either this file has no extension, or there is no known set of extensions for this mimetype :(
        _ => false,
    };

    Ok(Findings {
        file: path.to_path_buf(),
        valid,
        mime,
    })
}
/// Runs [scan_file] over every [DirEntry] in `entries` and gathers the outcomes into a vector.
///
/// With the `multi-threaded` feature enabled the work is distributed with rayon; otherwise the entries are scanned
/// one after another.
fn scan_from_walkdir(entries: &[DirEntry]) -> Vec<Result<Findings, ScanError>> {
    cfg_if! {
        if #[cfg(feature = "multi-threaded")] {
            // rather than a plain par_iter, hand the entries to rayon in batches of 32, with each batch being scanned
            // sequentially. the batching was introduced because it gave a pretty substantial speedup over
            // dispatching files one at a time. NOTE(review): not re-measured here.
            entries
                .par_chunks(32)
                .flat_map(|batch| batch.iter().map(scan_file).collect::<Vec<_>>())
                .collect()
        } else {
            entries.iter().map(scan_file).collect()
        }
    }
}
/// Scans a given directory with [WalkDir], filters with [wanted_file], checks for errors, and returns a vector of
/// [DirEntry]s.
fn scan_directory(dirs: &PathBuf, exts: &Vec<&str>, scan_opts: &ScanOpts) -> Option<Vec<DirEntry>> {
let stepper = WalkDir::new(dirs).into_iter();
let mut probably_fatal_error = false;
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(e, exts, scan_opts)) // filter out unwanted files
.filter_map(|e| {
if let Err(err) = &e {
debug!("uh oh spaghettio!! {:#?}", e);
// log errors to stdout, and remove them from the iterator
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
if err.depth() == 0 {
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
probably_fatal_error = true;
}
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
error!(
"{}: {}",
path,
err.io_error().map_or(err.to_string(), |e| e.to_string())
);
return None;
}
e.ok()
})
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
.collect();
if probably_fatal_error {
None
} else {
Some(entries)
}
}
/// Populates [MIMEDB] with a freshly initialised database for whichever backend was selected at compile time.
///
/// Panics if [MIMEDB] could not be set, which is the case if it already holds a value.
fn init_db() {
    cfg_if! {
        if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] {
            // the rejected value is swapped for a plain message before unwrapping
            MIMEDB
                .set(mime_db::InferDb::init())
                .map_err(|_| "Failed to initialise Infer backend!")
                .unwrap();
        } else {
            // the rejected value is swapped for a plain message before unwrapping
            MIMEDB
                .set(mime_db::XdgDb::init())
                .map_err(|_| "Failed to initialise XDG Mime backend!")
                .unwrap();
        }
    }
}