From 557f5132ff107bb45b73111c86e072079fe5c59c Mon Sep 17 00:00:00 2001 From: Lynnesbian Date: Sat, 28 Aug 2021 17:58:30 +1000 Subject: [PATCH] new files module --- src/files.rs | 198 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 198 +---------------------------------------------- src/main.rs | 5 +- src/tests/mod.rs | 2 +- 4 files changed, 204 insertions(+), 199 deletions(-) create mode 100644 src/files.rs diff --git a/src/files.rs b/src/files.rs new file mode 100644 index 0000000..b409076 --- /dev/null +++ b/src/files.rs @@ -0,0 +1,198 @@ +use log::{debug, error}; +use cfg_if::cfg_if; +use walkdir::{DirEntry, WalkDir}; +use std::collections::BTreeSet; +use crate::parameters::ScanOpts; +use mime_guess::from_ext; +use crate::findings::{Findings, ScanError}; +use crate::{inspectors, MIMEDB}; +use std::path::Path; + +cfg_if! { + if #[cfg(windows)] { + /// Determines whether or not a file is hidden by checking its win32 file attributes. + pub fn is_hidden(entry: &DirEntry) -> bool { + use std::os::windows::prelude::*; + std::fs::metadata(entry.path()) // try to get metadata for file + .map_or( + false, // if getting metadata/attributes fails, assume it's not hidden + |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants + ) + } + } else { + /// Determines whether or not a file is hidden by checking for a leading full stop. + pub fn is_hidden(entry: &DirEntry) -> bool { + entry + .file_name() + .to_str() + .map_or(false, |f| f.starts_with('.') && f != ".") + } + } +} + +/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in +/// `exts` (if specified), potentially skipping over hidden files, and so on. +pub fn wanted_file( + entry: &DirEntry, + exts: Option<&BTreeSet<&str>>, + exclude: Option<&BTreeSet<&str>>, + scan_opts: &ScanOpts, +) -> bool { + if entry.depth() == 0 { + // the root directory should always be scanned. + return true; + } + + if !scan_opts.hidden && is_hidden(entry) { + // skip hidden files and directories. this check is performed first because it's very lightweight. + return false; + } + + if entry.file_type().is_dir() { + // always allow directories - there's no point doing file extension matching on something that isn't a file. + return true; + } + + if let Some(ext) = entry.path().extension() { + // file has extension - discard invalid UTF-8 and normalise it to lowercase. + let ext = ext.to_string_lossy().to_lowercase(); + let ext = ext.as_str(); + + if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() { + // unknown extension, skip. + return false; + } + + if let Some(exts) = exts { + // only scan if the file has one of the specified extensions. + exts.contains(&ext) + } else { + // no extensions specified - the file should be scanned unless its extension is on the exclude list. + exclude.map_or(true, |exclude| !exclude.contains(&ext)) + } + } else { + // no file extension + scan_opts.extensionless + } +} + +/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure. +/// +/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a +/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be +/// determined. +pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result { + let path = entry.path(); + // try to determine mimetype for this entry + let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) { + // an error occurred while trying to read the file + Err(_) => return Err(ScanError::File(path)), + // the file was read successfully, but we were unable to determine its mimetype + Ok(None) => return Err(ScanError::Mime(path)), + // a mimetype was found! + Ok(Some(result)) => result, + }; + + // set of known extensions for the given mimetype + let known_exts = inspectors::mime_extension_lookup(result.essence_str().into()); + // file extension for this particular file + let entry_ext = path.extension(); + + let valid = match known_exts { + // there is a known set of extensions for this mimetype, and the file has an extension + Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()), + // either this file has no extension, or there is no known set of extensions for this mimetype :( + Some(_) | None => false, + }; + + let path = if canonical_paths { + match std::fs::canonicalize(path) { + Ok(path) => path, + Err(_) => return Err(ScanError::File(entry.path())), + } + } else { + path.to_path_buf() // :c + }; + + Ok(Findings { + file: path, + valid, + mime: result, + }) +} + +/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector. +pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec> { + cfg_if! { + if #[cfg(feature = "multi-threaded")] { + use rayon::prelude::*; + + // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread + entries + .par_chunks(32) + .flat_map(|chunk| { + chunk + .iter() // iter over the chunk, which is a slice of DirEntry structs + .map(|entry| scan_file(entry, canonical_paths)) + .collect::>() + }) + .collect() + } else { + entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect() + } + } +} + +/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of +/// [DirEntry]s. +pub fn scan_directory( + dirs: &Path, + exts: Option<&BTreeSet<&str>>, + exclude: Option<&BTreeSet<&str>>, + scan_opts: &ScanOpts, +) -> Option> { + let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter(); + let mut probably_fatal_error = false; + let entries: Vec = stepper + .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files + .filter_map(|e| { + if let Err(err) = &e { + debug!("uh oh spaghettio!! {:#?}", e); + // log errors to stdout, and remove them from the iterator + let path = err.path().map_or("General error".into(), Path::to_string_lossy); + + if err.depth() == 0 { + // if something goes wrong while trying to read the root directory, we're probably not going to get much done + probably_fatal_error = true; + } + + // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`? + // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it + // implements ToString (which they both do). map_or doesn't work on trait objects though :( + error!( + "{}: {}", + path, + err.io_error().map_or(err.to_string(), |e| e.to_string()) + ); + return None; + } + e.ok() + }) + // remove directories from the final list + .filter(|e| !e.file_type().is_dir()) + // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore + // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as + // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the + // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of + // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better: + // https://i.imgur.com/DYG7jlB.png + // adding the symlink filter removes the line that's being pointed to in the image. 0u0 + .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink()) + .collect(); + + if probably_fatal_error { + None + } else { + Some(entries) + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 853e8f3..99bbaea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,16 +8,11 @@ pub mod inspectors; pub mod parameters; pub mod string_type; pub mod utils; +pub mod files; use cfg_if::cfg_if; use once_cell::sync::OnceCell; -use walkdir::{DirEntry, WalkDir}; -use std::collections::BTreeSet; -use std::path::Path; -use log::{debug, error, warn}; -use mime_guess::from_ext; -use crate::parameters::ScanOpts; -use crate::findings::{Findings, ScanError}; +use crate::findings::Findings; use crate::mime_db::MimeDb; cfg_if! { @@ -30,195 +25,6 @@ cfg_if! { } } -cfg_if! { - if #[cfg(windows)] { - /// Determines whether or not a file is hidden by checking its win32 file attributes. - pub fn is_hidden(entry: &DirEntry) -> bool { - use std::os::windows::prelude::*; - std::fs::metadata(entry.path()) // try to get metadata for file - .map_or( - false, // if getting metadata/attributes fails, assume it's not hidden - |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants - ) - } - } else { - /// Determines whether or not a file is hidden by checking for a leading full stop. - pub fn is_hidden(entry: &DirEntry) -> bool { - entry - .file_name() - .to_str() - .map_or(false, |f| f.starts_with('.') && f != ".") - } - } -} - -/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in -/// `exts` (if specified), potentially skipping over hidden files, and so on. -pub fn wanted_file( - entry: &DirEntry, - exts: Option<&BTreeSet<&str>>, - exclude: Option<&BTreeSet<&str>>, - scan_opts: &ScanOpts, -) -> bool { - if entry.depth() == 0 { - // the root directory should always be scanned. - return true; - } - - if !scan_opts.hidden && is_hidden(entry) { - // skip hidden files and directories. this check is performed first because it's very lightweight. - return false; - } - - if entry.file_type().is_dir() { - // always allow directories - there's no point doing file extension matching on something that isn't a file. - return true; - } - - if let Some(ext) = entry.path().extension() { - // file has extension - discard invalid UTF-8 and normalise it to lowercase. - let ext = ext.to_string_lossy().to_lowercase(); - let ext = ext.as_str(); - - if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() { - // unknown extension, skip. - return false; - } - - if let Some(exts) = exts { - // only scan if the file has one of the specified extensions. - exts.contains(&ext) - } else { - // no extensions specified - the file should be scanned unless its extension is on the exclude list. - exclude.map_or(true, |exclude| !exclude.contains(&ext)) - } - } else { - // no file extension - scan_opts.extensionless - } -} - -/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure. -/// -/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a -/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be -/// determined. -pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result { - let path = entry.path(); - // try to determine mimetype for this entry - let result = match inspectors::mime_type(MIMEDB.get().unwrap(), path) { - // an error occurred while trying to read the file - Err(_) => return Err(ScanError::File(path)), - // the file was read successfully, but we were unable to determine its mimetype - Ok(None) => return Err(ScanError::Mime(path)), - // a mimetype was found! - Ok(Some(result)) => result, - }; - - // set of known extensions for the given mimetype - let known_exts = inspectors::mime_extension_lookup(result.essence_str().into()); - // file extension for this particular file - let entry_ext = path.extension(); - - let valid = match known_exts { - // there is a known set of extensions for this mimetype, and the file has an extension - Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()), - // either this file has no extension, or there is no known set of extensions for this mimetype :( - Some(_) | None => false, - }; - - let path = if canonical_paths { - match std::fs::canonicalize(path) { - Ok(path) => path, - Err(_) => return Err(ScanError::File(entry.path())), - } - } else { - path.to_path_buf() // :c - }; - - Ok(Findings { - file: path, - valid, - mime: result, - }) -} - -/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector. -pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec> { - cfg_if! { - if #[cfg(feature = "multi-threaded")] { - use rayon::prelude::*; - - // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread - entries - .par_chunks(32) - .flat_map(|chunk| { - chunk - .iter() // iter over the chunk, which is a slice of DirEntry structs - .map(|entry| scan_file(entry, canonical_paths)) - .collect::>() - }) - .collect() - } else { - entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect() - } - } -} - -/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of -/// [DirEntry]s. -pub fn scan_directory( - dirs: &Path, - exts: Option<&BTreeSet<&str>>, - exclude: Option<&BTreeSet<&str>>, - scan_opts: &ScanOpts, -) -> Option> { - let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter(); - let mut probably_fatal_error = false; - let entries: Vec = stepper - .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files - .filter_map(|e| { - if let Err(err) = &e { - debug!("uh oh spaghettio!! {:#?}", e); - // log errors to stdout, and remove them from the iterator - let path = err.path().map_or("General error".into(), Path::to_string_lossy); - - if err.depth() == 0 { - // if something goes wrong while trying to read the root directory, we're probably not going to get much done - probably_fatal_error = true; - } - - // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`? - // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it - // implements ToString (which they both do). map_or doesn't work on trait objects though :( - error!( - "{}: {}", - path, - err.io_error().map_or(err.to_string(), |e| e.to_string()) - ); - return None; - } - e.ok() - }) - // remove directories from the final list - .filter(|e| !e.file_type().is_dir()) - // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore - // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as - // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the - // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of - // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better: - // https://i.imgur.com/DYG7jlB.png - // adding the symlink filter removes the line that's being pointed to in the image. 0u0 - .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink()) - .collect(); - - if probably_fatal_error { - None - } else { - Some(entries) - } -} - /// Initialises [`MIMEDB`] with a value dependent on the current backend. pub fn init_db() { cfg_if! { diff --git a/src/main.rs b/src/main.rs index fbf58fa..9d4b30c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,7 +26,8 @@ use log::{debug, error, info, trace, warn, Level}; use fif::formats::Format; use fif::parameters::{OutputFormat}; use fif::utils::{clap_long_version, os_name}; -use fif::{init_db, scan_directory, parameters, formats}; +use fif::{init_db, parameters, formats}; +use fif::files::{scan_directory, scan_from_walkdir}; #[cfg(test)] mod tests; @@ -89,7 +90,7 @@ fn main() { trace!("Found {} items to check", entries.len()); - let results: Vec<_> = fif::scan_from_walkdir(&entries, args.canonical_paths) + let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths) .into_iter() .filter( |result| result.is_err() || !result.as_ref().unwrap().valid, diff --git a/src/tests/mod.rs b/src/tests/mod.rs index c921c0b..1b9cc01 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -3,7 +3,7 @@ use fif::formats::{Format, PowerShell, Shell}; use fif::inspectors::{mime_extension_lookup, BUF_SIZE}; use fif::mime_db::MimeDb; use fif::string_type::String; -use fif::{scan_directory, scan_from_walkdir}; +use fif::files::{scan_directory, scan_from_walkdir}; use crate::parameters::Parameters; use clap::Clap;