2021-02-14 16:20:48 +00:00
|
|
|
// fif - a command-line tool for detecting and optionally correcting files with incorrect extensions.
|
2021-02-05 05:57:21 +00:00
|
|
|
// Copyright (C) 2021 Lynnesbian
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU General Public License
|
|
|
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
2021-02-18 09:50:22 +00:00
|
|
|
use std::io::{stdout, BufWriter};
|
2021-02-05 12:45:51 +00:00
|
|
|
use std::path::{Path, PathBuf};
|
2021-02-14 16:20:48 +00:00
|
|
|
|
2021-02-05 09:24:08 +00:00
|
|
|
use clap::Clap;
|
2021-02-21 14:07:50 +00:00
|
|
|
use log::{debug, info, trace, warn, error};
|
2021-02-14 17:33:24 +00:00
|
|
|
use once_cell::sync::OnceCell;
|
2021-02-14 19:13:16 +00:00
|
|
|
#[cfg(feature = "multi-threaded")]
|
2021-02-18 09:48:38 +00:00
|
|
|
use rayon::prelude::*;
|
2021-02-18 09:15:59 +00:00
|
|
|
use smartstring::alias::String;
|
|
|
|
use walkdir::{DirEntry, WalkDir};
|
2021-02-14 16:20:48 +00:00
|
|
|
|
2021-02-14 17:33:24 +00:00
|
|
|
use crate::findings::Findings;
|
2021-02-14 16:20:48 +00:00
|
|
|
use crate::formats::{Format, Script};
|
2021-02-14 18:58:57 +00:00
|
|
|
use crate::mimedb::MimeDb;
|
2021-02-10 09:20:22 +00:00
|
|
|
use crate::parameters::OutputFormat;
|
|
|
|
use crate::scanerror::ScanError;
|
2021-02-21 14:07:50 +00:00
|
|
|
use std::process::exit;
|
2021-02-14 16:20:48 +00:00
|
|
|
|
2021-02-14 17:12:27 +00:00
|
|
|
mod findings;
|
2021-02-18 09:48:38 +00:00
|
|
|
mod formats;
|
|
|
|
mod inspectors;
|
2021-02-14 18:58:57 +00:00
|
|
|
mod mimedb;
|
2021-02-18 09:48:38 +00:00
|
|
|
mod parameters;
|
|
|
|
mod scanerror;
|
2021-02-18 11:43:24 +00:00
|
|
|
mod extensionset;
|
2021-02-06 03:24:13 +00:00
|
|
|
|
2021-02-14 18:58:57 +00:00
|
|
|
/// Global MIME database for builds using the `infer-backend` feature.
///
/// The cell starts out empty. `main` fills it exactly once via `set`, and `scan_file` reads it via `get`.
#[cfg(feature = "infer-backend")]
static MIMEDB: OnceCell<mimedb::InferDb> = OnceCell::new();

/// Global MIME database for builds using the `xdg-mime-backend` feature.
///
/// The cell starts out empty. `main` fills it exactly once via `set`, and `scan_file` reads it via `get`.
// NOTE(review): both statics share the name `MIMEDB`, so the two backend features are presumably meant to be
// mutually exclusive - confirm against Cargo.toml.
#[cfg(feature = "xdg-mime-backend")]
static MIMEDB: OnceCell<mimedb::XdgDb> = OnceCell::new();
|
2021-02-14 18:58:57 +00:00
|
|
|
|
|
|
|
// TODO: confirm that this behaves as intended on an actual Windows machine
/// Reports whether `entry` carries the Windows "hidden" file attribute.
///
/// The attributes are read through [`std::fs::metadata`]. If the metadata cannot be read, the entry is assumed not
/// to be hidden.
#[cfg(windows)]
fn is_hidden(entry: &DirEntry) -> bool {
    use std::os::windows::prelude::*;

    // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
    const HIDDEN_FLAG: u32 = 0x2;

    match std::fs::metadata(entry.path()) {
        Ok(meta) => meta.file_attributes() & HIDDEN_FLAG > 0,
        // if getting metadata/attributes fails, assume it's not hidden
        Err(_) => false,
    }
}
|
|
|
|
|
|
|
|
#[cfg(not(windows))]
|
|
|
|
fn is_hidden(entry: &DirEntry) -> bool {
|
2021-02-18 09:48:38 +00:00
|
|
|
entry
|
|
|
|
.file_name()
|
|
|
|
.to_str()
|
|
|
|
.map_or(false, |f| f.starts_with('.') && f != ".")
|
2021-02-04 11:22:19 +00:00
|
|
|
}
|
|
|
|
|
2021-02-18 12:11:13 +00:00
|
|
|
fn wanted_file(args: ¶meters::Parameters, exts: &[&str], entry: &DirEntry) -> bool {
|
2021-02-05 05:57:21 +00:00
|
|
|
if !args.scan_hidden && is_hidden(entry) {
|
|
|
|
// skip hidden files and directories. this check is performed first because it's very lightweight.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-02-04 11:22:19 +00:00
|
|
|
if entry.file_type().is_dir() {
|
2021-02-05 05:57:21 +00:00
|
|
|
// always allow directories - there's no point doing file extension matching on something that isn't a file.
|
2021-02-04 11:22:19 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-02-05 12:45:51 +00:00
|
|
|
let ext = extension_from_path(entry.path());
|
2021-02-04 11:22:19 +00:00
|
|
|
|
2021-02-18 09:48:38 +00:00
|
|
|
if ext.is_none() {
|
|
|
|
return false;
|
|
|
|
} // don't scan files without extensions. TODO - this should be configurable
|
2021-02-04 11:22:19 +00:00
|
|
|
|
2021-02-18 11:43:24 +00:00
|
|
|
exts.contains(&ext.unwrap().to_lowercase().as_str())
|
2021-02-04 11:22:19 +00:00
|
|
|
}
|
|
|
|
|
2021-02-05 12:45:51 +00:00
|
|
|
/// Returns the extension of `path` as an owned string, or `None` if the path has no extension.
///
/// The conversion from `OsStr` is lossy: anything that is not valid UTF-8 is replaced rather than rejected.
fn extension_from_path(path: &Path) -> Option<String> {
    // bail out early when there is no extension at all
    let ext = path.extension()?;
    // convert from OsStr to String
    Some(String::from(ext.to_string_lossy()))
}
|
|
|
|
|
2021-02-14 17:33:24 +00:00
|
|
|
fn scan_file(entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
|
2021-02-11 15:43:47 +00:00
|
|
|
// try to determine mimetype for this entry
|
2021-02-14 17:33:24 +00:00
|
|
|
let result = inspectors::mime_type(MIMEDB.get().unwrap(), entry.path());
|
2021-02-11 15:43:47 +00:00
|
|
|
|
|
|
|
if result.is_err() {
|
|
|
|
// an error occurred while trying to read the file
|
|
|
|
// error!("{}: {}", entry.path().to_string_lossy(), error);
|
|
|
|
return Err((ScanError::File, entry.path().to_path_buf()));
|
|
|
|
}
|
|
|
|
|
|
|
|
let result = result.unwrap();
|
|
|
|
if result.is_none() {
|
|
|
|
// the file was read successfully, but we were unable to determine its mimetype
|
|
|
|
// warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
|
|
|
|
return Err((ScanError::Mime, entry.path().to_path_buf()));
|
|
|
|
}
|
|
|
|
|
|
|
|
let result = result.unwrap();
|
|
|
|
|
|
|
|
// set of known extensions for the given mimetype
|
|
|
|
let known_exts = inspectors::mime_extension_lookup(result.clone());
|
|
|
|
// file extension for this particular file
|
|
|
|
let entry_ext = extension_from_path(entry.path());
|
|
|
|
|
|
|
|
let valid = match known_exts {
|
|
|
|
// there is a known set of extensions for this mimetype, and the file has an extension
|
|
|
|
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
|
|
|
// there is a known set of extensions for this mimetype, but the file has no extension
|
|
|
|
Some(_) => false,
|
2021-02-21 11:30:58 +00:00
|
|
|
// there is no known set of extensions for this mimetype :(
|
|
|
|
None => false,
|
2021-02-11 15:43:47 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
Ok(Findings {
|
|
|
|
file: entry.path().to_path_buf(),
|
2021-02-21 11:30:58 +00:00
|
|
|
valid,
|
2021-02-11 15:43:47 +00:00
|
|
|
mime: result,
|
|
|
|
})
|
2021-02-06 11:51:20 +00:00
|
|
|
}
|
|
|
|
|
2021-02-14 17:33:24 +00:00
|
|
|
fn scan_from_walkdir(entries: Vec<DirEntry>) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
|
2021-02-18 09:48:38 +00:00
|
|
|
#[cfg(feature = "multi-threaded")]
|
|
|
|
{
|
2021-02-10 09:20:22 +00:00
|
|
|
// rather than using a standard par_iter, split the entries into chunks of 32 first.
|
2021-02-18 09:15:59 +00:00
|
|
|
// this allows each spawned thread to handle 32 files before before closing, rather than creating a new thread for
|
2021-02-06 11:51:20 +00:00
|
|
|
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
|
|
|
|
entries
|
2021-02-10 09:20:22 +00:00
|
|
|
.par_chunks(32) // split into chunks of 32
|
2021-02-18 09:48:38 +00:00
|
|
|
.flat_map(|chunk| {
|
|
|
|
chunk // return Vec<...> instead of Chunk<Vec<...>>
|
|
|
|
.iter() // iter over the chunk, which is a slice of DirEntry structs
|
|
|
|
.map(|entry| scan_file(entry))
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
})
|
2021-02-06 11:51:20 +00:00
|
|
|
.collect()
|
|
|
|
}
|
|
|
|
|
2021-02-18 09:48:38 +00:00
|
|
|
#[cfg(not(feature = "multi-threaded"))]
|
|
|
|
{
|
|
|
|
entries.iter().map(|entry: &DirEntry| scan_file(entry)).collect()
|
2021-02-06 11:51:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-04 11:22:19 +00:00
|
|
|
fn main() {
|
2021-02-05 09:24:08 +00:00
|
|
|
let args = parameters::Parameters::parse();
|
2021-02-21 14:07:50 +00:00
|
|
|
|
2021-02-05 13:34:02 +00:00
|
|
|
let mut builder = env_logger::Builder::from_default_env();
|
|
|
|
builder
|
|
|
|
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
|
2021-02-06 03:24:13 +00:00
|
|
|
.format_module_path(false) // don't include module in logs, as it's not necessary
|
|
|
|
.format_timestamp(None) // don't include timestamps (unnecessary, and the feature flag is disabled anyway)
|
2021-02-10 09:20:22 +00:00
|
|
|
// .target(env_logger::Target::Stdout) // log to stdout rather than stderr
|
2021-02-05 13:34:02 +00:00
|
|
|
.init();
|
|
|
|
|
2021-02-14 18:58:57 +00:00
|
|
|
#[cfg(feature = "infer-backend")]
|
2021-02-18 09:48:38 +00:00
|
|
|
MIMEDB
|
|
|
|
.set(mimedb::InferDb::init())
|
2021-02-21 11:30:58 +00:00
|
|
|
.or(Err("Failed to initialise Infer backend!"))
|
2021-02-18 09:48:38 +00:00
|
|
|
.unwrap();
|
2021-02-14 18:58:57 +00:00
|
|
|
|
|
|
|
#[cfg(feature = "xdg-mime-backend")]
|
2021-02-18 09:48:38 +00:00
|
|
|
MIMEDB
|
|
|
|
.set(mimedb::XdgDb::init())
|
2021-02-21 11:30:58 +00:00
|
|
|
.or(Err("Failed to initialise XDG Mime backend!"))
|
2021-02-18 09:48:38 +00:00
|
|
|
.unwrap();
|
2021-02-14 18:58:57 +00:00
|
|
|
|
2021-02-05 13:34:02 +00:00
|
|
|
debug!("Iterating directory: {:?}", args.dirs);
|
2021-02-04 11:22:19 +00:00
|
|
|
|
2021-02-18 11:43:24 +00:00
|
|
|
let extensions: Vec<&str> = if let Some(exts) = &args.exts {
|
|
|
|
exts
|
|
|
|
.iter()
|
|
|
|
.map(|s| s.as_str())
|
|
|
|
.collect()
|
|
|
|
} else if let Some(exts) = &args.ext_set {
|
|
|
|
exts.extensions().to_vec()
|
|
|
|
} else {
|
|
|
|
unreachable!()
|
|
|
|
};
|
|
|
|
|
|
|
|
debug!("Checking files with extensions: {:?}", extensions);
|
|
|
|
|
2021-02-04 11:22:19 +00:00
|
|
|
let stepper = WalkDir::new(&args.dirs).into_iter();
|
2021-02-21 14:07:50 +00:00
|
|
|
let mut probably_fatal_error= false;
|
2021-02-05 09:15:12 +00:00
|
|
|
let entries: Vec<DirEntry> = stepper
|
2021-02-18 11:43:24 +00:00
|
|
|
.filter_entry(|e| wanted_file(&args, &extensions, e)) // filter out unwanted files
|
2021-02-21 14:07:50 +00:00
|
|
|
.filter_map(|e| {
|
|
|
|
if let Err(err) = &e {
|
|
|
|
debug!("uh oh spaghettio!! {:#?}", e);
|
|
|
|
// log errors to stdout, and remove them from the iterator
|
|
|
|
let path = err
|
|
|
|
.path()
|
|
|
|
.map_or("General error".into(), Path::to_string_lossy);
|
|
|
|
|
|
|
|
if err.depth() == 0 {
|
|
|
|
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
|
|
|
|
probably_fatal_error = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
|
|
|
|
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
|
|
|
|
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
|
|
|
|
error!("{}: {}", path, err.io_error().map_or(err.to_string(), |e|e.to_string()));
|
|
|
|
return None
|
|
|
|
}
|
|
|
|
e.ok()
|
|
|
|
})
|
2021-02-05 12:45:51 +00:00
|
|
|
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
|
2021-02-05 09:15:12 +00:00
|
|
|
.collect();
|
2021-02-05 05:57:21 +00:00
|
|
|
|
2021-02-21 14:07:50 +00:00
|
|
|
if entries.is_empty() {
|
|
|
|
if probably_fatal_error {
|
|
|
|
// no need to log anything for fatal errors - fif will already have printed something obvious like
|
|
|
|
// "[ERROR] /fake/path: No such file or directory (os error 2)". we can assume that if this has happened, the dir
|
|
|
|
// given as input doesn't exist or is otherwise unreadable.
|
|
|
|
exit(exitcode::NOINPUT);
|
|
|
|
}
|
|
|
|
|
|
|
|
warn!("No files matching requested options found.");
|
|
|
|
exit(exitcode::DATAERR);
|
|
|
|
}
|
|
|
|
|
2021-02-05 13:49:36 +00:00
|
|
|
trace!("Found {} items to check", entries.len());
|
2021-02-05 05:57:21 +00:00
|
|
|
|
2021-02-21 14:07:50 +00:00
|
|
|
let results: Vec<_> = scan_from_walkdir(entries)
|
|
|
|
.into_iter()
|
|
|
|
.filter(|result|
|
|
|
|
result.is_err()
|
|
|
|
|| !result.as_ref().unwrap().valid
|
|
|
|
// TODO: find a way to trace! the valid files without doing ↓
|
|
|
|
// || if result.as_ref().unwrap().valid { trace!("{:?} is fine", result.as_ref().unwrap().file); false } else { true }
|
|
|
|
)
|
|
|
|
.collect();
|
2021-02-05 12:45:51 +00:00
|
|
|
|
2021-02-10 09:20:22 +00:00
|
|
|
for result in &results {
|
2021-02-05 12:45:51 +00:00
|
|
|
match result {
|
2021-02-05 13:49:36 +00:00
|
|
|
Ok(r) => {
|
2021-02-21 14:07:50 +00:00
|
|
|
info!(
|
|
|
|
"{:?} should have file extension {}",
|
|
|
|
r.file,
|
|
|
|
r.recommended_extension().unwrap_or_else(|| "???".into())
|
|
|
|
)
|
2021-02-05 13:49:36 +00:00
|
|
|
}
|
2021-02-18 09:48:38 +00:00
|
|
|
Err(f) => warn!("{:#?}: Error 0uo - {}", f.1, f.0),
|
2021-02-05 12:45:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-21 14:07:50 +00:00
|
|
|
if results.is_empty() { info!("All files have valid extensions!") }
|
|
|
|
|
2021-02-10 09:20:22 +00:00
|
|
|
match args.output_format {
|
2021-02-14 17:12:27 +00:00
|
|
|
OutputFormat::Script => {
|
2021-02-10 09:20:22 +00:00
|
|
|
let s = Script::new();
|
2021-02-21 14:07:50 +00:00
|
|
|
if s.write_all(
|
|
|
|
&results,
|
|
|
|
&mut BufWriter::new(stdout().lock())
|
|
|
|
).is_err() {
|
|
|
|
exit(exitcode::IOERR);
|
|
|
|
}
|
2021-02-14 17:12:27 +00:00
|
|
|
}
|
2021-02-18 09:48:38 +00:00
|
|
|
OutputFormat::Text => todo!(),
|
2021-02-10 09:20:22 +00:00
|
|
|
}
|
|
|
|
|
2021-02-05 12:45:51 +00:00
|
|
|
debug!("Done");
|
2021-02-04 11:22:19 +00:00
|
|
|
}
|