diff --git a/Cargo.lock b/Cargo.lock index cd03aa6..8ab92af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -311,9 +311,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.100" +version = "0.2.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1fa8cddc8fbbee11227ef194b5317ed014b8acbf15139bd716a18ad3fe99ec5" +checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" [[package]] name = "log" diff --git a/clippy.sh b/clippy.sh index e625bfb..b16d373 100755 --- a/clippy.sh +++ b/clippy.sh @@ -34,6 +34,9 @@ for backend in "${_backends[@]}"; do -A clippy::multiple-crate-versions \ -A clippy::cast-possible-truncation \ -A clippy::cast-possible-wrap \ + -A clippy::must_use_candidate \ + -A clippy::missing_panics_doc \ + -A clippy::missing_errors_doc \ "$_extra" done @@ -43,5 +46,8 @@ done # shadow_unrelated: sometimes things that seem unrelated are actually related ;) # option_if_let_else: the suggested code is usually harder to read than the original # multiple_crate_versions: cached uses an old version of hashbrown :c -# cast-possible-truncation: only ever used where it would be totally fine -# cast-possible-wrap: ditto +# cast_possible_truncation: only ever used where it would be totally fine +# cast_possible_wrap: ditto +# must_use_candidate: useless +# missing_panics_doc: the docs are just for me, fif isn't really intended to be used as a library, so this is unneeded +# missing_errors_doc: ditto diff --git a/src/formats.rs b/src/formats.rs index 4ea3c67..01db525 100644 --- a/src/formats.rs +++ b/src/formats.rs @@ -17,12 +17,16 @@ use itertools::{Either, Itertools}; /// A macro for creating an array of `Writable`s without needing to pepper your code with `into()`s. /// # Usage /// ``` -/// let f = std::io::stdout(); +/// use crate::fif::writables; +/// use crate::fif::formats::{Writable, smart_write}; +/// let mut f = std::io::stdout(); +/// /// // Instead of... -/// smart_write(f, &["hello".into(), Writable::Newline]); +/// smart_write(&mut f, &["hello".into(), Writable::Newline]); /// // ...just use: -/// smart_write(f, writables!["hello", Newline]); +/// smart_write(&mut f, writables!["hello", Newline]); /// ``` + #[macro_export] macro_rules! writables { [$($args:tt),+] => { @@ -71,7 +75,7 @@ impl<'a> From<&'a OsStr> for Writable<'a> { fn generated_by() -> String { format!("Generated by fif {}", clap_long_version()) } -fn smart_write(f: &mut W, writeables: &[Writable]) -> io::Result<()> { +pub fn smart_write(f: &mut W, writeables: &[Writable]) -> io::Result<()> { // ehhhh for writeable in writeables { match writeable { diff --git a/src/inspectors.rs b/src/inspectors.rs index c572653..a598181 100644 --- a/src/inspectors.rs +++ b/src/inspectors.rs @@ -9,8 +9,8 @@ use std::str::FromStr; use cached::cached; use mime::Mime; -use crate::mime_db::MimeDb; use crate::string_type::String; +use crate::MimeDb; /// The number of bytes to read initially. /// diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..853e8f3 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,237 @@ +#![forbid(unsafe_code)] +#![warn(trivial_casts, unused_lifetimes, unused_qualifications)] + +pub mod mime_db; +pub mod findings; +pub mod formats; +pub mod inspectors; +pub mod parameters; +pub mod string_type; +pub mod utils; + +use cfg_if::cfg_if; +use once_cell::sync::OnceCell; +use walkdir::{DirEntry, WalkDir}; +use std::collections::BTreeSet; +use std::path::Path; +use log::{debug, error, warn}; +use mime_guess::from_ext; +use crate::parameters::ScanOpts; +use crate::findings::{Findings, ScanError}; +use crate::mime_db::MimeDb; + +cfg_if! { + if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] { + /// A [OnceCell] holding an instance of [mime_db::MimeDb]. + pub static MIMEDB: OnceCell = OnceCell::new(); + } else { + /// A [OnceCell] holding an instance of [mime_db::MimeDb]. + pub static MIMEDB: OnceCell = OnceCell::new(); + } +} + +cfg_if! { + if #[cfg(windows)] { + /// Determines whether or not a file is hidden by checking its win32 file attributes. + pub fn is_hidden(entry: &DirEntry) -> bool { + use std::os::windows::prelude::*; + std::fs::metadata(entry.path()) // try to get metadata for file + .map_or( + false, // if getting metadata/attributes fails, assume it's not hidden + |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants + ) + } + } else { + /// Determines whether or not a file is hidden by checking for a leading full stop. + pub fn is_hidden(entry: &DirEntry) -> bool { + entry + .file_name() + .to_str() + .map_or(false, |f| f.starts_with('.') && f != ".") + } + } +} + +/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in +/// `exts` (if specified), potentially skipping over hidden files, and so on. +pub fn wanted_file( + entry: &DirEntry, + exts: Option<&BTreeSet<&str>>, + exclude: Option<&BTreeSet<&str>>, + scan_opts: &ScanOpts, +) -> bool { + if entry.depth() == 0 { + // the root directory should always be scanned. + return true; + } + + if !scan_opts.hidden && is_hidden(entry) { + // skip hidden files and directories. this check is performed first because it's very lightweight. + return false; + } + + if entry.file_type().is_dir() { + // always allow directories - there's no point doing file extension matching on something that isn't a file. + return true; + } + + if let Some(ext) = entry.path().extension() { + // file has extension - discard invalid UTF-8 and normalise it to lowercase. + let ext = ext.to_string_lossy().to_lowercase(); + let ext = ext.as_str(); + + if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() { + // unknown extension, skip. + return false; + } + + if let Some(exts) = exts { + // only scan if the file has one of the specified extensions. + exts.contains(&ext) + } else { + // no extensions specified - the file should be scanned unless its extension is on the exclude list. + exclude.map_or(true, |exclude| !exclude.contains(&ext)) + } + } else { + // no file extension + scan_opts.extensionless + } +} + +/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure. +/// +/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a +/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be +/// determined. +pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result { + let path = entry.path(); + // try to determine mimetype for this entry + let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) { + // an error occurred while trying to read the file + Err(_) => return Err(ScanError::File(path)), + // the file was read successfully, but we were unable to determine its mimetype + Ok(None) => return Err(ScanError::Mime(path)), + // a mimetype was found! + Ok(Some(result)) => result, + }; + + // set of known extensions for the given mimetype + let known_exts = inspectors::mime_extension_lookup(result.essence_str().into()); + // file extension for this particular file + let entry_ext = path.extension(); + + let valid = match known_exts { + // there is a known set of extensions for this mimetype, and the file has an extension + Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()), + // either this file has no extension, or there is no known set of extensions for this mimetype :( + Some(_) | None => false, + }; + + let path = if canonical_paths { + match std::fs::canonicalize(path) { + Ok(path) => path, + Err(_) => return Err(ScanError::File(entry.path())), + } + } else { + path.to_path_buf() // :c + }; + + Ok(Findings { + file: path, + valid, + mime: result, + }) +} + +/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector. +pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec> { + cfg_if! { + if #[cfg(feature = "multi-threaded")] { + use rayon::prelude::*; + + // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread + entries + .par_chunks(32) + .flat_map(|chunk| { + chunk + .iter() // iter over the chunk, which is a slice of DirEntry structs + .map(|entry| scan_file(entry, canonical_paths)) + .collect::>() + }) + .collect() + } else { + entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect() + } + } +} + +/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of +/// [DirEntry]s. +pub fn scan_directory( + dirs: &Path, + exts: Option<&BTreeSet<&str>>, + exclude: Option<&BTreeSet<&str>>, + scan_opts: &ScanOpts, +) -> Option> { + let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter(); + let mut probably_fatal_error = false; + let entries: Vec = stepper + .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files + .filter_map(|e| { + if let Err(err) = &e { + debug!("uh oh spaghettio!! {:#?}", e); + // log errors to stdout, and remove them from the iterator + let path = err.path().map_or("General error".into(), Path::to_string_lossy); + + if err.depth() == 0 { + // if something goes wrong while trying to read the root directory, we're probably not going to get much done + probably_fatal_error = true; + } + + // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`? + // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it + // implements ToString (which they both do). map_or doesn't work on trait objects though :( + error!( + "{}: {}", + path, + err.io_error().map_or(err.to_string(), |e| e.to_string()) + ); + return None; + } + e.ok() + }) + // remove directories from the final list + .filter(|e| !e.file_type().is_dir()) + // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore + // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as + // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the + // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of + // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better: + // https://i.imgur.com/DYG7jlB.png + // adding the symlink filter removes the line that's being pointed to in the image. 0u0 + .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink()) + .collect(); + + if probably_fatal_error { + None + } else { + Some(entries) + } +} + +/// Initialises [`MIMEDB`] with a value dependent on the current backend. +pub fn init_db() { + cfg_if! { + if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] { + MIMEDB + .set(crate::mime_db::InferDb::init()) + .or(Err("Failed to initialise Infer backend!")) + .unwrap(); + } else { + MIMEDB + .set(crate::mime_db::XdgDb::init()) + .or(Err("Failed to initialise XDG Mime backend!")) + .unwrap(); + } + } +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index bcf43a1..fbf58fa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,44 +18,18 @@ #![warn(trivial_casts, unused_lifetimes, unused_qualifications)] use std::io::{stdout, BufWriter, Write}; -use std::path::Path; use std::process::exit; -use cfg_if::cfg_if; use clap::Clap; use log::{debug, error, info, trace, warn, Level}; -use once_cell::sync::OnceCell; -use walkdir::{DirEntry, WalkDir}; -use crate::findings::Findings; -use crate::findings::ScanError; -use crate::formats::Format; -use crate::mime_db::MimeDb; -use crate::parameters::{OutputFormat, ScanOpts}; -use crate::utils::{clap_long_version, os_name}; -use mime_guess::from_ext; -use std::collections::BTreeSet; - -mod findings; -mod formats; -mod inspectors; -mod mime_db; -mod parameters; -pub(crate) mod string_type; +use fif::formats::Format; +use fif::parameters::{OutputFormat}; +use fif::utils::{clap_long_version, os_name}; +use fif::{init_db, scan_directory, parameters, formats}; #[cfg(test)] mod tests; -mod utils; - -cfg_if! { - if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] { - /// A [OnceCell] holding an instance of [mime_db::MimeDb]. - static MIMEDB: OnceCell = OnceCell::new(); - } else { - /// A [OnceCell] holding an instance of [mime_db::MimeDb]. - static MIMEDB: OnceCell = OnceCell::new(); - } -} #[doc(hidden)] #[allow(clippy::cognitive_complexity)] @@ -115,7 +89,7 @@ fn main() { trace!("Found {} items to check", entries.len()); - let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths) + let results: Vec<_> = fif::scan_from_walkdir(&entries, args.canonical_paths) .into_iter() .filter( |result| result.is_err() || !result.as_ref().unwrap().valid, @@ -168,208 +142,3 @@ fn main() { debug!("Done"); } -cfg_if! { - if #[cfg(windows)] { - /// Determines whether or not a file is hidden by checking its win32 file attributes. - fn is_hidden(entry: &DirEntry) -> bool { - use std::os::windows::prelude::*; - std::fs::metadata(entry.path()) // try to get metadata for file - .map_or( - false, // if getting metadata/attributes fails, assume it's not hidden - |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants - ) - } - } else { - /// Determines whether or not a file is hidden by checking for a leading full stop. - fn is_hidden(entry: &DirEntry) -> bool { - entry - .file_name() - .to_str() - .map_or(false, |f| f.starts_with('.') && f != ".") - } - } -} - -/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in -/// `exts` (if specified), potentially skipping over hidden files, and so on. -fn wanted_file( - entry: &DirEntry, - exts: Option<&BTreeSet<&str>>, - exclude: Option<&BTreeSet<&str>>, - scan_opts: &ScanOpts, -) -> bool { - if entry.depth() == 0 { - // the root directory should always be scanned. - return true; - } - - if !scan_opts.hidden && is_hidden(entry) { - // skip hidden files and directories. this check is performed first because it's very lightweight. - return false; - } - - if entry.file_type().is_dir() { - // always allow directories - there's no point doing file extension matching on something that isn't a file. - return true; - } - - if let Some(ext) = entry.path().extension() { - // file has extension - discard invalid UTF-8 and normalise it to lowercase. - let ext = ext.to_string_lossy().to_lowercase(); - let ext = ext.as_str(); - - if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() { - // unknown extension, skip. - return false; - } - - if let Some(exts) = exts { - // only scan if the file has one of the specified extensions. - exts.contains(&ext) - } else { - // no extensions specified - the file should be scanned unless its extension is on the exclude list. - exclude.map_or(true, |exclude| !exclude.contains(&ext)) - } - } else { - // no file extension - scan_opts.extensionless - } -} - -/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure. -/// -/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a -/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be -/// determined. -fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result { - let path = entry.path(); - // try to determine mimetype for this entry - let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) { - // an error occurred while trying to read the file - Err(_) => return Err(ScanError::File(path)), - // the file was read successfully, but we were unable to determine its mimetype - Ok(None) => return Err(ScanError::Mime(path)), - // a mimetype was found! - Ok(Some(result)) => result, - }; - - // set of known extensions for the given mimetype - let known_exts = inspectors::mime_extension_lookup(result.essence_str().into()); - // file extension for this particular file - let entry_ext = path.extension(); - - let valid = match known_exts { - // there is a known set of extensions for this mimetype, and the file has an extension - Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()), - // either this file has no extension, or there is no known set of extensions for this mimetype :( - Some(_) | None => false, - }; - - let path = if canonical_paths { - match std::fs::canonicalize(path) { - Ok(path) => path, - Err(_) => return Err(ScanError::File(entry.path())), - } - } else { - path.to_path_buf() // :c - }; - - Ok(Findings { - file: path, - valid, - mime: result, - }) -} - -/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector. -fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec> { - cfg_if! { - if #[cfg(feature = "multi-threaded")] { - use rayon::prelude::*; - - // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread - entries - .par_chunks(32) - .flat_map(|chunk| { - chunk - .iter() // iter over the chunk, which is a slice of DirEntry structs - .map(|entry| scan_file(entry, canonical_paths)) - .collect::>() - }) - .collect() - } else { - entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect() - } - } -} - -/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of -/// [DirEntry]s. -fn scan_directory( - dirs: &Path, - exts: Option<&BTreeSet<&str>>, - exclude: Option<&BTreeSet<&str>>, - scan_opts: &ScanOpts, -) -> Option> { - let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter(); - let mut probably_fatal_error = false; - let entries: Vec = stepper - .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files - .filter_map(|e| { - if let Err(err) = &e { - debug!("uh oh spaghettio!! {:#?}", e); - // log errors to stdout, and remove them from the iterator - let path = err.path().map_or("General error".into(), Path::to_string_lossy); - - if err.depth() == 0 { - // if something goes wrong while trying to read the root directory, we're probably not going to get much done - probably_fatal_error = true; - } - - // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`? - // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it - // implements ToString (which they both do). map_or doesn't work on trait objects though :( - error!( - "{}: {}", - path, - err.io_error().map_or(err.to_string(), |e| e.to_string()) - ); - return None; - } - e.ok() - }) - // remove directories from the final list - .filter(|e| !e.file_type().is_dir()) - // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore - // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as - // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the - // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of - // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better: - // https://i.imgur.com/DYG7jlB.png - // adding the symlink filter removes the line that's being pointed to in the image. 0u0 - .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink()) - .collect(); - - if probably_fatal_error { - None - } else { - Some(entries) - } -} - -/// Initialises [`MIMEDB`] with a value dependent on the current backend. -fn init_db() { - cfg_if! { - if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] { - MIMEDB - .set(mime_db::InferDb::init()) - .or(Err("Failed to initialise Infer backend!")) - .unwrap(); - } else { - MIMEDB - .set(mime_db::XdgDb::init()) - .or(Err("Failed to initialise XDG Mime backend!")) - .unwrap(); - } - } -} diff --git a/src/tests/mod.rs b/src/tests/mod.rs index afdc82c..c921c0b 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,9 +1,9 @@ -use crate::findings::Findings; -use crate::formats::{Format, PowerShell, Shell}; -use crate::inspectors::{mime_extension_lookup, BUF_SIZE}; -use crate::mime_db::MimeDb; -use crate::string_type::String; -use crate::{scan_directory, scan_from_walkdir}; +use fif::findings::Findings; +use fif::formats::{Format, PowerShell, Shell}; +use fif::inspectors::{mime_extension_lookup, BUF_SIZE}; +use fif::mime_db::MimeDb; +use fif::string_type::String; +use fif::{scan_directory, scan_from_walkdir}; use crate::parameters::Parameters; use clap::Clap; @@ -21,12 +21,12 @@ const ZIP_BYTES: &[u8] = b"PK\x03\x04"; cfg_if::cfg_if! { if #[cfg(any(all(unix, feature = "infer-backend"), all(not(unix), not(feature = "xdg-mime-backend"))))] { - fn get_mime_db() -> crate::mime_db::InferDb { - crate::mime_db::InferDb::init() + fn get_mime_db() -> fif::mime_db::InferDb { + fif::mime_db::InferDb::init() } } else { - fn get_mime_db() -> crate::mime_db::XdgDb { - crate::mime_db::XdgDb::init() + fn get_mime_db() -> fif::mime_db::XdgDb { + fif::mime_db::XdgDb::init() } } } @@ -335,7 +335,7 @@ fn identify_random_bytes() { } println!( "No type found:\t{} counts", - results.values().len() as i32 - results.values().sum::() + 1000 - results.values().sum::() ); } @@ -432,8 +432,8 @@ fn media_contains_audio_video_images() { #[test] /// Ensure that the `writables!` macro produces the output it should. fn writables_is_correct() { - use crate::formats::Writable; - use crate::writables; + use fif::formats::Writable; + use fif::writables; assert_eq!( &["henlo".into(), Path::new("henlo").into(), Writable::Newline,],