Compare commits
4 commits
67fb03821d
...
58f3b323cc
Author | SHA1 | Date | |
---|---|---|---|
58f3b323cc | |||
ecee74eb29 | |||
31aaa80701 | |||
e294f56ecf |
11 changed files with 260 additions and 190 deletions
|
@ -3,6 +3,20 @@
|
||||||
<component name="CsvFileAttributes">
|
<component name="CsvFileAttributes">
|
||||||
<option name="attributeMap">
|
<option name="attributeMap">
|
||||||
<map>
|
<map>
|
||||||
|
<entry key="/Cargo.toml">
|
||||||
|
<value>
|
||||||
|
<Attribute>
|
||||||
|
<option name="separator" value="," />
|
||||||
|
</Attribute>
|
||||||
|
</value>
|
||||||
|
</entry>
|
||||||
|
<entry key="/src/inspectors.rs">
|
||||||
|
<value>
|
||||||
|
<Attribute>
|
||||||
|
<option name="separator" value="	" />
|
||||||
|
</Attribute>
|
||||||
|
</value>
|
||||||
|
</entry>
|
||||||
<entry key="/src/main.rs">
|
<entry key="/src/main.rs">
|
||||||
<value>
|
<value>
|
||||||
<Attribute>
|
<Attribute>
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
[package]
|
[package]
|
||||||
name = "fif"
|
name = "fif"
|
||||||
description = "A command-line tool for detecting and optionally correcting files with incorrect extensions."
|
description = "A command-line tool for detecting and optionally correcting files with incorrect extensions."
|
||||||
version = "0.2.6"
|
version = "0.2.7"
|
||||||
authors = ["Lynnesbian <lynne@bune.city>"]
|
authors = ["Lynnesbian <lynne@bune.city>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
|
|
|
@ -1,32 +1,42 @@
|
||||||
|
//! Sets of extensions for use with [Parameters](crate::parameters::Parameters)'s `-E` flag.
|
||||||
use clap::Clap;
|
use clap::Clap;
|
||||||
|
|
||||||
#[derive(Clap, PartialEq, Debug)]
|
#[derive(Clap, PartialEq, Debug)]
|
||||||
pub enum ExtensionSet {
|
pub enum ExtensionSet {
|
||||||
|
/// Extensions used for image file formats, such as `png`, `jpeg`, `webp`, etc.
|
||||||
Images,
|
Images,
|
||||||
|
/// Extensions used for audio file formats, such as `mp3`, `ogg`, `flac`, etc.
|
||||||
Audio,
|
Audio,
|
||||||
|
/// Extensions used for video file formats, such as `mkv`, `mp4`, `mov`, etc.
|
||||||
Videos,
|
Videos,
|
||||||
|
/// Extensions used for media file formats. This acts as a combination of the [Images](ExtensionSet::Images),
|
||||||
|
/// [Audio](ExtensionSet::Audio) and [Videos](ExtensionSet::Videos) variants.
|
||||||
Media,
|
Media,
|
||||||
|
/// Extensions used for document file formats, such as `pdf`, `odt`, `docx`, etc.
|
||||||
Documents,
|
Documents,
|
||||||
|
/// Extensions used for archive file formats, such as `zip`, `zst`, `gz`, etc.
|
||||||
Archives,
|
Archives,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExtensionSet {
|
impl ExtensionSet {
|
||||||
|
/// The list of known extensions for this ExtensionSet.
|
||||||
pub fn extensions(&self) -> Vec<&str> {
|
pub fn extensions(&self) -> Vec<&str> {
|
||||||
match self {
|
match self {
|
||||||
Self::Images => mime_guess::get_mime_extensions_str("image/*"),
|
Self::Images => mime_guess::get_mime_extensions_str("image/*").unwrap().to_vec(),
|
||||||
Self::Videos => mime_guess::get_mime_extensions_str("video/*"),
|
Self::Audio => mime_guess::get_mime_extensions_str("audio/*").unwrap().to_vec(),
|
||||||
Self::Audio => mime_guess::get_mime_extensions_str("audio/*"),
|
Self::Videos => mime_guess::get_mime_extensions_str("video/*").unwrap().to_vec(),
|
||||||
Self::Documents => Some(
|
Self::Media => [
|
||||||
&[
|
Self::Images.extensions(),
|
||||||
|
Self::Audio.extensions(),
|
||||||
|
Self::Videos.extensions(),
|
||||||
|
]
|
||||||
|
.concat(),
|
||||||
|
Self::Documents => vec![
|
||||||
"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps",
|
"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps",
|
||||||
][..],
|
],
|
||||||
),
|
|
||||||
// many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used
|
// many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used
|
||||||
// somehow to extract extensions for compressed files from mime_guess?
|
// somehow to extract extensions for compressed files from mime_guess?
|
||||||
Self::Archives => Some(&["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2"][..]),
|
Self::Archives => vec!["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2"],
|
||||||
_ => todo!(),
|
}
|
||||||
}
|
|
||||||
.unwrap()
|
|
||||||
.to_vec()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,9 +5,13 @@ use smartstring::alias::String;
|
||||||
|
|
||||||
use crate::inspectors::mime_extension_lookup;
|
use crate::inspectors::mime_extension_lookup;
|
||||||
|
|
||||||
|
/// Information about a scanned file.
|
||||||
pub struct Findings {
|
pub struct Findings {
|
||||||
|
/// The location of the scanned file.
|
||||||
pub file: PathBuf, // TODO: replace with Path???? <'a> and all that
|
pub file: PathBuf, // TODO: replace with Path???? <'a> and all that
|
||||||
|
/// Whether or not the file's extension is valid for its mimetype.
|
||||||
pub valid: bool,
|
pub valid: bool,
|
||||||
|
/// The file's mimetype.
|
||||||
pub mime: Mime,
|
pub mime: Mime,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
//! The various formats that [fif](crate) can output to.
|
||||||
|
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
use std::os::unix::ffi::OsStrExt;
|
use std::os::unix::ffi::OsStrExt;
|
||||||
|
@ -8,8 +10,10 @@ use snailquote::escape;
|
||||||
use crate::scanerror::ScanError;
|
use crate::scanerror::ScanError;
|
||||||
use crate::{Findings, BACKEND};
|
use crate::{Findings, BACKEND};
|
||||||
|
|
||||||
|
/// The current version of fif, as defined in Cargo.toml.
|
||||||
const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION");
|
const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION");
|
||||||
|
|
||||||
|
#[doc(hidden)]
|
||||||
type Entries = [Result<Findings, (ScanError, PathBuf)>];
|
type Entries = [Result<Findings, (ScanError, PathBuf)>];
|
||||||
|
|
||||||
enum Writable<'a> {
|
enum Writable<'a> {
|
||||||
|
@ -97,6 +101,7 @@ pub trait Format {
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: maybe make a batch script version for windows
|
// TODO: maybe make a batch script version for windows
|
||||||
|
/// Bourne-Shell compatible script.
|
||||||
pub struct Script {}
|
pub struct Script {}
|
||||||
|
|
||||||
impl Format for Script {
|
impl Format for Script {
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
//! Functions for getting the mime type and extension of a file.
|
||||||
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::{Read, Seek, SeekFrom};
|
use std::io::{Read, Seek, SeekFrom};
|
||||||
|
@ -10,13 +12,17 @@ use smartstring::alias::String;
|
||||||
|
|
||||||
use crate::mimedb::MimeDb;
|
use crate::mimedb::MimeDb;
|
||||||
|
|
||||||
// rather than reading once into a large buffer, it tends to be faster to first try identifying the file from a small
|
/// The number of bytes to read initially.
|
||||||
// chunk read from the top, and *then* proceeding with the large buffer. many file formats can be easily identified by
|
///
|
||||||
// the first 128 bytes. of course, not all formats can, and some (OOXML...) require reading a long ways in.
|
/// Rather than reading the entire file all at once into a [BUF_SIZE] buffer, it tends to be faster to read a small
|
||||||
|
/// chunk of the file and try to identify that, proceeding with the larger buffer if that fails. Many file formats
|
||||||
|
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
|
||||||
const INITIAL_BUF_SIZE: usize = 128;
|
const INITIAL_BUF_SIZE: usize = 128;
|
||||||
|
|
||||||
|
/// The number of bytes to read if the file couldn't be identified from its first [INITIAL_BUF_SIZE] bytes.
|
||||||
const BUF_SIZE: usize = 4096;
|
const BUF_SIZE: usize = 4096;
|
||||||
|
|
||||||
|
/// Tries to identify the mimetype of a file from a given path.
|
||||||
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
let mut buffer = [0; INITIAL_BUF_SIZE];
|
let mut buffer = [0; INITIAL_BUF_SIZE];
|
||||||
let mut file = File::open(path)?;
|
let mut file = File::open(path)?;
|
||||||
|
@ -59,6 +65,10 @@ cached! {
|
||||||
MIMEXT;
|
MIMEXT;
|
||||||
fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
|
fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
|
||||||
|
|
||||||
|
// Returns a list of known extensions for this mime type, if any.
|
||||||
|
// ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... maybe i should switch to
|
||||||
|
// the derive macro
|
||||||
|
|
||||||
// match on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions ignores the type
|
// match on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions ignores the type
|
||||||
// suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. passing the essence_str
|
// suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. passing the essence_str
|
||||||
// (which includes the suffix) fixes this.
|
// (which includes the suffix) fixes this.
|
||||||
|
|
343
src/main.rs
343
src/main.rs
|
@ -31,8 +31,8 @@ use crate::formats::{Format, Script};
|
||||||
use crate::mimedb::MimeDb;
|
use crate::mimedb::MimeDb;
|
||||||
use crate::parameters::OutputFormat;
|
use crate::parameters::OutputFormat;
|
||||||
use crate::scanerror::ScanError;
|
use crate::scanerror::ScanError;
|
||||||
use std::process::exit;
|
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
|
use std::process::exit;
|
||||||
|
|
||||||
mod extensionset;
|
mod extensionset;
|
||||||
mod findings;
|
mod findings;
|
||||||
|
@ -47,178 +47,23 @@ mod tests;
|
||||||
|
|
||||||
cfg_if! {
|
cfg_if! {
|
||||||
if #[cfg(any(all(not(unix), not(feature = "xdg-mime-backend")), all(unix, feature = "infer-backend")))] {
|
if #[cfg(any(all(not(unix), not(feature = "xdg-mime-backend")), all(unix, feature = "infer-backend")))] {
|
||||||
|
/// A [OnceCell] holding an instance of [mimedb::MimeDb].
|
||||||
static MIMEDB: OnceCell<mimedb::InferDb> = OnceCell::new();
|
static MIMEDB: OnceCell<mimedb::InferDb> = OnceCell::new();
|
||||||
|
/// The backend being used; either "Infer" or "XDG-Mime".
|
||||||
const BACKEND: &str = "Infer";
|
const BACKEND: &str = "Infer";
|
||||||
} else {
|
} else {
|
||||||
|
/// A [OnceCell] holding an instance of [mimedb::MimeDb].
|
||||||
static MIMEDB: OnceCell<mimedb::XdgDb> = OnceCell::new();
|
static MIMEDB: OnceCell<mimedb::XdgDb> = OnceCell::new();
|
||||||
|
/// The backend being used; either "Infer" or "XDG-Mime".
|
||||||
const BACKEND: &str = "XDG-Mime";
|
const BACKEND: &str = "XDG-Mime";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cfg_if! {
|
#[doc(hidden)]
|
||||||
if #[cfg(windows)] {
|
|
||||||
fn is_hidden(entry: &DirEntry) -> bool {
|
|
||||||
use std::os::windows::prelude::*;
|
|
||||||
std::fs::metadata(entry.path()) // try to get metadata for file
|
|
||||||
.map_or(
|
|
||||||
false, // if getting metadata/attributes fails, assume it's not hidden
|
|
||||||
|f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
|
|
||||||
)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fn is_hidden(entry: &DirEntry) -> bool {
|
|
||||||
entry
|
|
||||||
.file_name()
|
|
||||||
.to_str()
|
|
||||||
.map_or(false, |f| f.starts_with('.') && f != ".")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn wanted_file(scan_hidden: bool, exts: &[&str], entry: &DirEntry) -> bool {
|
|
||||||
if !scan_hidden && is_hidden(entry) {
|
|
||||||
// skip hidden files and directories. this check is performed first because it's very lightweight.
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if entry.file_type().is_dir() {
|
|
||||||
// always allow directories - there's no point doing file extension matching on something that isn't a file.
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
let ext = extension_from_path(entry.path());
|
|
||||||
|
|
||||||
if ext.is_none() {
|
|
||||||
return false;
|
|
||||||
} // don't scan files without extensions. TODO - this should be configurable
|
|
||||||
|
|
||||||
exts.contains(&ext.unwrap().to_lowercase().as_str())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn extension_from_path(path: &Path) -> Option<String> {
|
|
||||||
path.extension(). // Get the path's extension
|
|
||||||
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
|
|
||||||
}
|
|
||||||
|
|
||||||
fn scan_file(entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
|
|
||||||
// try to determine mimetype for this entry
|
|
||||||
let result = inspectors::mime_type(MIMEDB.get().unwrap(), entry.path());
|
|
||||||
|
|
||||||
if result.is_err() {
|
|
||||||
// an error occurred while trying to read the file
|
|
||||||
// error!("{}: {}", entry.path().to_string_lossy(), error);
|
|
||||||
return Err((ScanError::File, entry.path().to_path_buf()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let result = result.unwrap();
|
|
||||||
if result.is_none() {
|
|
||||||
// the file was read successfully, but we were unable to determine its mimetype
|
|
||||||
// warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
|
|
||||||
return Err((ScanError::Mime, entry.path().to_path_buf()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let result = result.unwrap();
|
|
||||||
|
|
||||||
// set of known extensions for the given mimetype
|
|
||||||
let known_exts = inspectors::mime_extension_lookup(result.clone());
|
|
||||||
// file extension for this particular file
|
|
||||||
let entry_ext = extension_from_path(entry.path());
|
|
||||||
|
|
||||||
let valid = match known_exts {
|
|
||||||
// there is a known set of extensions for this mimetype, and the file has an extension
|
|
||||||
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
|
||||||
// either this file has no extension, or there is no known set of extensions for this mimetype :(
|
|
||||||
Some(_) | None => false,
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Findings {
|
|
||||||
file: entry.path().to_path_buf(),
|
|
||||||
valid,
|
|
||||||
mime: result,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn scan_from_walkdir(entries: &[DirEntry]) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
|
|
||||||
cfg_if! {
|
|
||||||
if #[cfg(feature = "multi-threaded")] {
|
|
||||||
// rather than using a standard par_iter, split the entries into chunks of 32 first.
|
|
||||||
// this allows each spawned thread to handle 32 files before closing, rather than creating a new thread for
|
|
||||||
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
|
|
||||||
entries
|
|
||||||
.par_chunks(32) // split into chunks of 32
|
|
||||||
.flat_map(|chunk| {
|
|
||||||
chunk // return Vec<...> instead of Chunk<Vec<...>>
|
|
||||||
.iter() // iter over the chunk, which is a slice of DirEntry structs
|
|
||||||
.map(|entry| scan_file(entry))
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
} else {
|
|
||||||
entries.iter().map(|entry: &DirEntry| scan_file(entry)).collect()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn scan_directory(dirs: &PathBuf, exts: &Vec<&str>, scan_hidden: bool) -> Option<Vec<DirEntry>> {
|
|
||||||
let stepper = WalkDir::new(dirs).into_iter();
|
|
||||||
let mut probably_fatal_error = false;
|
|
||||||
let entries: Vec<DirEntry> = stepper
|
|
||||||
.filter_entry(|e| wanted_file(scan_hidden, exts, e)) // filter out unwanted files
|
|
||||||
.filter_map(|e| {
|
|
||||||
if let Err(err) = &e {
|
|
||||||
debug!("uh oh spaghettio!! {:#?}", e);
|
|
||||||
// log errors to stdout, and remove them from the iterator
|
|
||||||
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
|
|
||||||
|
|
||||||
if err.depth() == 0 {
|
|
||||||
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
|
|
||||||
probably_fatal_error = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
|
|
||||||
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
|
|
||||||
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
|
|
||||||
error!(
|
|
||||||
"{}: {}",
|
|
||||||
path,
|
|
||||||
err.io_error().map_or(err.to_string(), |e| e.to_string())
|
|
||||||
);
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
e.ok()
|
|
||||||
})
|
|
||||||
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if probably_fatal_error {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(entries)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn init_db() {
|
|
||||||
cfg_if! {
|
|
||||||
if #[cfg(any(all(not(unix), not(feature = "xdg-mime-backend")), all(unix, feature = "infer-backend")))] {
|
|
||||||
MIMEDB
|
|
||||||
.set(mimedb::InferDb::init())
|
|
||||||
.or(Err("Failed to initialise Infer backend!"))
|
|
||||||
.unwrap();
|
|
||||||
} else {
|
|
||||||
MIMEDB
|
|
||||||
.set(mimedb::XdgDb::init())
|
|
||||||
.or(Err("Failed to initialise XDG Mime backend!"))
|
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let args: parameters::Parameters = parameters::Parameters::parse();
|
let args: parameters::Parameters = parameters::Parameters::parse();
|
||||||
|
|
||||||
let mut builder = env_logger::Builder::from_env(
|
let mut builder = env_logger::Builder::from_env(Env::new().filter_or("RUST_LOG", "INFO"));
|
||||||
Env::new().filter_or("RUST_LOG", "INFO")
|
|
||||||
);
|
|
||||||
|
|
||||||
builder
|
builder
|
||||||
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
|
// .format(|buf, r| writeln!(buf, "{} - {}", r.level(), r.args()))
|
||||||
|
@ -293,3 +138,177 @@ fn main() {
|
||||||
|
|
||||||
debug!("Done");
|
debug!("Done");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cfg_if! {
|
||||||
|
if #[cfg(windows)] {
|
||||||
|
/// Determines whether or not a file is hidden by checking its win32 file attributes.
|
||||||
|
fn is_hidden(entry: &DirEntry) -> bool {
|
||||||
|
use std::os::windows::prelude::*;
|
||||||
|
std::fs::metadata(entry.path()) // try to get metadata for file
|
||||||
|
.map_or(
|
||||||
|
false, // if getting metadata/attributes fails, assume it's not hidden
|
||||||
|
|f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
|
||||||
|
)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/// Determines whether or not a file is hidden by checking for a leading full stop.
|
||||||
|
fn is_hidden(entry: &DirEntry) -> bool {
|
||||||
|
entry
|
||||||
|
.file_name()
|
||||||
|
.to_str()
|
||||||
|
.map_or(false, |f| f.starts_with('.') && f != ".")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if a file matches the given criteria.
|
||||||
|
fn wanted_file(scan_hidden: bool, exts: &[&str], entry: &DirEntry) -> bool {
|
||||||
|
if !scan_hidden && is_hidden(entry) {
|
||||||
|
// skip hidden files and directories. this check is performed first because it's very lightweight.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if entry.file_type().is_dir() {
|
||||||
|
// always allow directories - there's no point doing file extension matching on something that isn't a file.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
let ext = extension_from_path(entry.path());
|
||||||
|
|
||||||
|
if ext.is_none() {
|
||||||
|
return false;
|
||||||
|
} // don't scan files without extensions. TODO - this should be configurable
|
||||||
|
|
||||||
|
exts.contains(&ext.unwrap().to_lowercase().as_str())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Given a file path, returns its extension, using [std::path::Path::extension].
|
||||||
|
///
|
||||||
|
/// The extension is currently [converted to a lossy string](std::ffi::OsStr::to_string_lossy), although it will
|
||||||
|
/// eventually return an OsStr instead.
|
||||||
|
fn extension_from_path(path: &Path) -> Option<String> {
|
||||||
|
path.extension(). // Get the path's extension
|
||||||
|
map(|e| String::from(e.to_string_lossy())) // Convert from OsStr to String
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inspects the given entry, returning a [Findings] on success and a tuple of [ScanError] and [PathBuf] on failure.
|
||||||
|
///
|
||||||
|
/// In the event of an IO error, the returned ScanError will be of type [ScanError::File]. Otherwise, a
|
||||||
|
/// [ScanError::Mime] will be returned, meaning that the file was scanned successfully, but a mimetype could not be
|
||||||
|
/// determined.
|
||||||
|
fn scan_file(entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
|
||||||
|
// try to determine mimetype for this entry
|
||||||
|
let result = inspectors::mime_type(MIMEDB.get().unwrap(), entry.path());
|
||||||
|
|
||||||
|
if result.is_err() {
|
||||||
|
// an error occurred while trying to read the file
|
||||||
|
// error!("{}: {}", entry.path().to_string_lossy(), error);
|
||||||
|
return Err((ScanError::File, entry.path().to_path_buf()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = result.unwrap();
|
||||||
|
if result.is_none() {
|
||||||
|
// the file was read successfully, but we were unable to determine its mimetype
|
||||||
|
// warn!("Couldn't determine mimetype for {}", entry.path().to_string_lossy());
|
||||||
|
return Err((ScanError::Mime, entry.path().to_path_buf()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = result.unwrap();
|
||||||
|
|
||||||
|
// set of known extensions for the given mimetype
|
||||||
|
let known_exts = inspectors::mime_extension_lookup(result.clone());
|
||||||
|
// file extension for this particular file
|
||||||
|
let entry_ext = extension_from_path(entry.path());
|
||||||
|
|
||||||
|
let valid = match known_exts {
|
||||||
|
// there is a known set of extensions for this mimetype, and the file has an extension
|
||||||
|
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
||||||
|
// either this file has no extension, or there is no known set of extensions for this mimetype :(
|
||||||
|
Some(_) | None => false,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Findings {
|
||||||
|
file: entry.path().to_path_buf(),
|
||||||
|
valid,
|
||||||
|
mime: result,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Takes a slice of [DirEntry]s and calls [scan_file] on each one, returning the results in a vector.
|
||||||
|
fn scan_from_walkdir(entries: &[DirEntry]) -> Vec<Result<Findings, (ScanError, PathBuf)>> {
|
||||||
|
cfg_if! {
|
||||||
|
if #[cfg(feature = "multi-threaded")] {
|
||||||
|
// rather than using a standard par_iter, split the entries into chunks of 32 first.
|
||||||
|
// this allows each spawned thread to handle 32 files before closing, rather than creating a new thread for
|
||||||
|
// each file. this leads to a pretty substantial speedup that i'm pretty substantially happy about 0u0
|
||||||
|
entries
|
||||||
|
.par_chunks(32) // split into chunks of 32
|
||||||
|
.flat_map(|chunk| {
|
||||||
|
chunk // return Vec<...> instead of Chunk<Vec<...>>
|
||||||
|
.iter() // iter over the chunk, which is a slice of DirEntry structs
|
||||||
|
.map(|entry| scan_file(entry))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
} else {
|
||||||
|
entries.iter().map(|entry: &DirEntry| scan_file(entry)).collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scans a given directory with [WalkDir], filters with [wanted_file], checks for errors, and returns a vector of
|
||||||
|
/// [DirEntry]s.
|
||||||
|
fn scan_directory(dirs: &PathBuf, exts: &Vec<&str>, scan_hidden: bool) -> Option<Vec<DirEntry>> {
|
||||||
|
let stepper = WalkDir::new(dirs).into_iter();
|
||||||
|
let mut probably_fatal_error = false;
|
||||||
|
let entries: Vec<DirEntry> = stepper
|
||||||
|
.filter_entry(|e| wanted_file(scan_hidden, exts, e)) // filter out unwanted files
|
||||||
|
.filter_map(|e| {
|
||||||
|
if let Err(err) = &e {
|
||||||
|
debug!("uh oh spaghettio!! {:#?}", e);
|
||||||
|
// log errors to stdout, and remove them from the iterator
|
||||||
|
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
|
||||||
|
|
||||||
|
if err.depth() == 0 {
|
||||||
|
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
|
||||||
|
probably_fatal_error = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
|
||||||
|
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
|
||||||
|
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
|
||||||
|
error!(
|
||||||
|
"{}: {}",
|
||||||
|
path,
|
||||||
|
err.io_error().map_or(err.to_string(), |e| e.to_string())
|
||||||
|
);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
e.ok()
|
||||||
|
})
|
||||||
|
.filter(|e| !e.file_type().is_dir()) // remove directories from the final list
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if probably_fatal_error {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(entries)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialises [MIMEDB] with a value dependent on the current backend.
|
||||||
|
fn init_db() {
|
||||||
|
cfg_if! {
|
||||||
|
if #[cfg(any(all(not(unix), not(feature = "xdg-mime-backend")), all(unix, feature = "infer-backend")))] {
|
||||||
|
MIMEDB
|
||||||
|
.set(mimedb::InferDb::init())
|
||||||
|
.or(Err("Failed to initialise Infer backend!"))
|
||||||
|
.unwrap();
|
||||||
|
} else {
|
||||||
|
MIMEDB
|
||||||
|
.set(mimedb::XdgDb::init())
|
||||||
|
.or(Err("Failed to initialise XDG Mime backend!"))
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
//! Backend-neutral Mime database implementation.
|
||||||
|
|
||||||
use cfg_if::cfg_if;
|
use cfg_if::cfg_if;
|
||||||
use mime_guess::Mime;
|
use mime_guess::Mime;
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
|
//! [Clap] struct used to parse command line arguments.
|
||||||
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use crate::extensionset::ExtensionSet;
|
use crate::extensionset::ExtensionSet;
|
||||||
use clap::Clap;
|
use clap::{Clap};
|
||||||
use smartstring::{LazyCompact, SmartString};
|
use smartstring::{LazyCompact, SmartString};
|
||||||
|
|
||||||
#[derive(Clap, PartialEq, Debug)]
|
#[derive(Clap, PartialEq, Debug)]
|
||||||
|
@ -10,6 +12,8 @@ pub enum OutputFormat {
|
||||||
Text,
|
Text,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: convert this to macro style?: https://docs.rs/clap/3.0.0-beta.2/clap/index.html#using-macros
|
||||||
|
|
||||||
#[derive(Clap, Debug)]
|
#[derive(Clap, Debug)]
|
||||||
#[clap(version = option_env!("CARGO_PKG_VERSION").unwrap_or("???"))]
|
#[clap(version = option_env!("CARGO_PKG_VERSION").unwrap_or("???"))]
|
||||||
pub struct Parameters {
|
pub struct Parameters {
|
||||||
|
|
|
@ -2,7 +2,9 @@ use std::fmt::{Display, Formatter, Result};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum ScanError {
|
pub enum ScanError {
|
||||||
|
/// Something went wrong while trying to read the given file.
|
||||||
File,
|
File,
|
||||||
|
/// Failed to determine the mimetype of the given file.
|
||||||
Mime,
|
Mime,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,13 +2,13 @@ use crate::inspectors::mime_extension_lookup;
|
||||||
use crate::mimedb::*;
|
use crate::mimedb::*;
|
||||||
use crate::{extension_from_path, init_db, scan_directory, scan_from_walkdir};
|
use crate::{extension_from_path, init_db, scan_directory, scan_from_walkdir};
|
||||||
|
|
||||||
|
use crate::parameters::Parameters;
|
||||||
use cfg_if::cfg_if;
|
use cfg_if::cfg_if;
|
||||||
use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG};
|
use mime_guess::mime::{APPLICATION_OCTET_STREAM, APPLICATION_PDF, IMAGE_JPEG, IMAGE_PNG};
|
||||||
use mime_guess::Mime;
|
use mime_guess::Mime;
|
||||||
use smartstring::alias::String;
|
use smartstring::alias::String;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use crate::parameters::Parameters;
|
|
||||||
|
|
||||||
const JPEG_BYTES: &[u8] = b"\xFF\xD8\xFF";
|
const JPEG_BYTES: &[u8] = b"\xFF\xD8\xFF";
|
||||||
const PNG_BYTES: &[u8] = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";
|
const PNG_BYTES: &[u8] = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";
|
||||||
|
@ -118,16 +118,16 @@ fn simple_directory() {
|
||||||
// 2. ensure mime type detected is IMAGE_PNG
|
// 2. ensure mime type detected is IMAGE_PNG
|
||||||
assert_eq!(result.mime, IMAGE_PNG);
|
assert_eq!(result.mime, IMAGE_PNG);
|
||||||
// 3. ensure recommended extension is in the list of known extensions for PNG files
|
// 3. ensure recommended extension is in the list of known extensions for PNG files
|
||||||
assert!(mime_extension_lookup(IMAGE_PNG).unwrap().contains(&result.recommended_extension().unwrap()));
|
assert!(mime_extension_lookup(IMAGE_PNG)
|
||||||
|
.unwrap()
|
||||||
|
.contains(&result.recommended_extension().unwrap()));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if the recommended extension for this file is in the list of known extensions for its mimetype
|
// check if the recommended extension for this file is in the list of known extensions for its mimetype
|
||||||
assert!(
|
assert!(mime_extension_lookup(result.mime.clone())
|
||||||
mime_extension_lookup(result.mime.clone())
|
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.contains(&result.recommended_extension().unwrap())
|
.contains(&result.recommended_extension().unwrap()));
|
||||||
);
|
|
||||||
|
|
||||||
// make sure the guessed mimetype is correct based on the extension of the scanned file
|
// make sure the guessed mimetype is correct based on the extension of the scanned file
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|
Loading…
Reference in a new issue