new files module

This commit is contained in:
Lynne Megido 2021-08-28 17:58:30 +10:00
parent d625fef106
commit 557f5132ff
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90
4 changed files with 204 additions and 199 deletions

198
src/files.rs Normal file
View file

@ -0,0 +1,198 @@
use log::{debug, error};
use cfg_if::cfg_if;
use walkdir::{DirEntry, WalkDir};
use std::collections::BTreeSet;
use crate::parameters::ScanOpts;
use mime_guess::from_ext;
use crate::findings::{Findings, ScanError};
use crate::{inspectors, MIMEDB};
use std::path::Path;
cfg_if! {
    if #[cfg(windows)] {
        /// Reports whether `entry` carries the win32 "hidden" file attribute.
        ///
        /// If the file's metadata can't be read, the entry is treated as not hidden.
        pub fn is_hidden(entry: &DirEntry) -> bool {
            use std::os::windows::prelude::*;
            // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
            const HIDDEN_FLAG: u32 = 0x2;
            match std::fs::metadata(entry.path()) {
                Ok(metadata) => metadata.file_attributes() & HIDDEN_FLAG > 0,
                // metadata/attributes unavailable - assume the file is visible
                Err(_) => false,
            }
        }
    } else {
        /// Reports whether `entry` is hidden, i.e. whether its file name begins with a full stop.
        ///
        /// A file name of exactly `.` is not considered hidden, and neither is a name that isn't valid UTF-8.
        pub fn is_hidden(entry: &DirEntry) -> bool {
            match entry.file_name().to_str() {
                Some(name) => name.starts_with('.') && name != ".",
                None => false,
            }
        }
    }
}
/// Decides whether `entry` should be kept while walking a directory tree.
///
/// The checks run in this order:
/// - the root entry (depth 0) is always kept;
/// - hidden entries are dropped unless `scan_opts.hidden` is set;
/// - directories are always kept;
/// - files without an extension are kept only if `scan_opts.extensionless` is set;
/// - files whose extension is unknown to `mime_guess` are dropped if `scan_opts.ignore_unknown_exts` is set;
/// - if `exts` is given, the lowercased extension must appear in it; otherwise it must not appear in `exclude`
///   (when `exclude` is given).
pub fn wanted_file(
    entry: &DirEntry,
    exts: Option<&BTreeSet<&str>>,
    exclude: Option<&BTreeSet<&str>>,
    scan_opts: &ScanOpts,
) -> bool {
    // the root of the walk is never filtered out.
    if entry.depth() == 0 {
        return true;
    }

    // cheap, so it goes before everything else: drop hidden files and directories unless asked to keep them.
    if !scan_opts.hidden && is_hidden(entry) {
        return false;
    }

    // extension matching is meaningless for directories, so they always pass.
    if entry.file_type().is_dir() {
        return true;
    }

    // invalid UTF-8 in the extension is replaced lossily, and the result is normalised to lowercase.
    let lowercase_ext = match entry.path().extension() {
        Some(raw) => raw.to_string_lossy().to_lowercase(),
        // no extension at all - the decision is entirely up to the `extensionless` option.
        None => return scan_opts.extensionless,
    };
    let lowercase_ext = lowercase_ext.as_str();

    if scan_opts.ignore_unknown_exts && from_ext(lowercase_ext).is_empty() {
        // extension isn't known, and we've been told to skip those.
        return false;
    }

    match exts {
        // an allow list was given - the extension has to be on it.
        Some(wanted) => wanted.contains(&lowercase_ext),
        // no allow list - scan the file unless its extension is on the exclude list.
        None => match exclude {
            Some(unwanted) => !unwanted.contains(&lowercase_ext),
            None => true,
        },
    }
}
/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
///
/// The returned [`Findings`] holds the file's path (canonicalised if `canonical_paths` is set), the mimetype that was
/// detected, and a `valid` flag. `valid` is `true` only when there is a known set of extensions for the detected
/// mimetype and the file's lowercased extension appears in that set.
///
/// # Errors
/// - [`ScanError::File`] if the file couldn't be read, or if `canonical_paths` is set and the path couldn't be
///   canonicalised.
/// - [`ScanError::Mime`] if the file was read successfully, but a mimetype could not be determined.
///
/// # Panics
/// Panics if [`MIMEDB`] has not been initialised.
pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
    let path = entry.path();
    // the database is a process-wide invariant: scanning before it has been set up is a programming error.
    let db = MIMEDB.get().expect("MIMEDB must be initialised before scanning files");

    // try to determine mimetype for this entry
    let mime = match inspectors::mime_type(db, path) {
        // an error occurred while trying to read the file
        Err(_) => return Err(ScanError::File(path)),
        // the file was read successfully, but we were unable to determine its mimetype
        Ok(None) => return Err(ScanError::Mime(path)),
        // a mimetype was found!
        Ok(Some(mime)) => mime,
    };

    // set of known extensions for the given mimetype
    let known_exts = inspectors::mime_extension_lookup(mime.essence_str().into());

    let valid = match (known_exts, path.extension()) {
        // there is a known set of extensions for this mimetype, and the file has an extension
        (Some(known), Some(ext)) => known.contains(&ext.to_string_lossy().to_lowercase().into()),
        // either this file has no extension, or there is no known set of extensions for this mimetype :(
        _ => false,
    };

    let file = if canonical_paths {
        // a path that can't be canonicalised is reported the same way as a file that can't be read
        std::fs::canonicalize(path).map_err(|_| ScanError::File(path))?
    } else {
        path.to_path_buf() // :c
    };

    Ok(Findings { file, valid, mime })
}
/// Runs [`scan_file`] over every [`DirEntry`] in `entries` and gathers the outcomes into a vector.
///
/// With the `multi-threaded` feature enabled, the entries are processed in parallel with rayon in chunks of 32;
/// otherwise they are scanned one after another.
pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
    cfg_if! {
        if #[cfg(feature = "multi-threaded")] {
            use rayon::prelude::*;

            // hand rayon batches of 32 entries; each batch is scanned with a plain sequential iterator, and the
            // per-batch results are flattened back into a single vector.
            entries
                .par_chunks(32)
                .flat_map(|batch| {
                    batch
                        .iter()
                        .map(|item| scan_file(item, canonical_paths))
                        .collect::<Vec<_>>()
                })
                .collect()
        } else {
            let mut results = Vec::with_capacity(entries.len());
            for item in entries {
                results.push(scan_file(item, canonical_paths));
            }
            results
        }
    }
}
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
/// [DirEntry]s.
pub fn scan_directory(
dirs: &Path,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> Option<Vec<DirEntry>> {
let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
let mut probably_fatal_error = false;
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
.filter_map(|e| {
if let Err(err) = &e {
debug!("uh oh spaghettio!! {:#?}", e);
// log errors to stdout, and remove them from the iterator
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
if err.depth() == 0 {
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
probably_fatal_error = true;
}
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
error!(
"{}: {}",
path,
err.io_error().map_or(err.to_string(), |e| e.to_string())
);
return None;
}
e.ok()
})
// remove directories from the final list
.filter(|e| !e.file_type().is_dir())
// if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
// any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
// if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
// output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
// confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
// https://i.imgur.com/DYG7jlB.png
// adding the symlink filter removes the line that's being pointed to in the image. 0u0
.filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
.collect();
if probably_fatal_error {
None
} else {
Some(entries)
}
}

View file

@ -8,16 +8,11 @@ pub mod inspectors;
pub mod parameters;
pub mod string_type;
pub mod utils;
pub mod files;
use cfg_if::cfg_if;
use once_cell::sync::OnceCell;
use walkdir::{DirEntry, WalkDir};
use std::collections::BTreeSet;
use std::path::Path;
use log::{debug, error, warn};
use mime_guess::from_ext;
use crate::parameters::ScanOpts;
use crate::findings::{Findings, ScanError};
use crate::findings::Findings;
use crate::mime_db::MimeDb;
cfg_if! {
@ -30,195 +25,6 @@ cfg_if! {
}
}
cfg_if! {
    if #[cfg(windows)] {
        /// Reports whether `entry` carries the win32 "hidden" file attribute.
        ///
        /// If the file's metadata can't be read, the entry is treated as not hidden.
        pub fn is_hidden(entry: &DirEntry) -> bool {
            use std::os::windows::prelude::*;
            // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
            const HIDDEN_FLAG: u32 = 0x2;
            match std::fs::metadata(entry.path()) {
                Ok(metadata) => metadata.file_attributes() & HIDDEN_FLAG > 0,
                // metadata/attributes unavailable - assume the file is visible
                Err(_) => false,
            }
        }
    } else {
        /// Reports whether `entry` is hidden, i.e. whether its file name begins with a full stop.
        ///
        /// A file name of exactly `.` is not considered hidden, and neither is a name that isn't valid UTF-8.
        pub fn is_hidden(entry: &DirEntry) -> bool {
            match entry.file_name().to_str() {
                Some(name) => name.starts_with('.') && name != ".",
                None => false,
            }
        }
    }
}
/// Decides whether `entry` should be kept while walking a directory tree.
///
/// The checks run in this order:
/// - the root entry (depth 0) is always kept;
/// - hidden entries are dropped unless `scan_opts.hidden` is set;
/// - directories are always kept;
/// - files without an extension are kept only if `scan_opts.extensionless` is set;
/// - files whose extension is unknown to `mime_guess` are dropped if `scan_opts.ignore_unknown_exts` is set;
/// - if `exts` is given, the lowercased extension must appear in it; otherwise it must not appear in `exclude`
///   (when `exclude` is given).
pub fn wanted_file(
    entry: &DirEntry,
    exts: Option<&BTreeSet<&str>>,
    exclude: Option<&BTreeSet<&str>>,
    scan_opts: &ScanOpts,
) -> bool {
    // the root of the walk is never filtered out.
    if entry.depth() == 0 {
        return true;
    }

    // cheap, so it goes before everything else: drop hidden files and directories unless asked to keep them.
    if !scan_opts.hidden && is_hidden(entry) {
        return false;
    }

    // extension matching is meaningless for directories, so they always pass.
    if entry.file_type().is_dir() {
        return true;
    }

    // invalid UTF-8 in the extension is replaced lossily, and the result is normalised to lowercase.
    let lowercase_ext = match entry.path().extension() {
        Some(raw) => raw.to_string_lossy().to_lowercase(),
        // no extension at all - the decision is entirely up to the `extensionless` option.
        None => return scan_opts.extensionless,
    };
    let lowercase_ext = lowercase_ext.as_str();

    if scan_opts.ignore_unknown_exts && from_ext(lowercase_ext).is_empty() {
        // extension isn't known, and we've been told to skip those.
        return false;
    }

    match exts {
        // an allow list was given - the extension has to be on it.
        Some(wanted) => wanted.contains(&lowercase_ext),
        // no allow list - scan the file unless its extension is on the exclude list.
        None => match exclude {
            Some(unwanted) => !unwanted.contains(&lowercase_ext),
            None => true,
        },
    }
}
/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
///
/// The returned [`Findings`] holds the file's path (canonicalised if `canonical_paths` is set), the mimetype that was
/// detected, and a `valid` flag. `valid` is `true` only when there is a known set of extensions for the detected
/// mimetype and the file's lowercased extension appears in that set.
///
/// # Errors
/// - [`ScanError::File`] if the file couldn't be read, or if `canonical_paths` is set and the path couldn't be
///   canonicalised.
/// - [`ScanError::Mime`] if the file was read successfully, but a mimetype could not be determined.
///
/// # Panics
/// Panics if [`MIMEDB`] has not been initialised.
pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
    let path = entry.path();
    // the database is a process-wide invariant: scanning before it has been set up is a programming error.
    let db = MIMEDB.get().expect("MIMEDB must be initialised before scanning files");

    // try to determine mimetype for this entry
    let mime = match inspectors::mime_type(db, path) {
        // an error occurred while trying to read the file
        Err(_) => return Err(ScanError::File(path)),
        // the file was read successfully, but we were unable to determine its mimetype
        Ok(None) => return Err(ScanError::Mime(path)),
        // a mimetype was found!
        Ok(Some(mime)) => mime,
    };

    // set of known extensions for the given mimetype
    let known_exts = inspectors::mime_extension_lookup(mime.essence_str().into());

    let valid = match (known_exts, path.extension()) {
        // there is a known set of extensions for this mimetype, and the file has an extension
        (Some(known), Some(ext)) => known.contains(&ext.to_string_lossy().to_lowercase().into()),
        // either this file has no extension, or there is no known set of extensions for this mimetype :(
        _ => false,
    };

    let file = if canonical_paths {
        // a path that can't be canonicalised is reported the same way as a file that can't be read
        std::fs::canonicalize(path).map_err(|_| ScanError::File(path))?
    } else {
        path.to_path_buf() // :c
    };

    Ok(Findings { file, valid, mime })
}
/// Runs [`scan_file`] over every [`DirEntry`] in `entries` and gathers the outcomes into a vector.
///
/// With the `multi-threaded` feature enabled, the entries are processed in parallel with rayon in chunks of 32;
/// otherwise they are scanned one after another.
pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
    cfg_if! {
        if #[cfg(feature = "multi-threaded")] {
            use rayon::prelude::*;

            // hand rayon batches of 32 entries; each batch is scanned with a plain sequential iterator, and the
            // per-batch results are flattened back into a single vector.
            entries
                .par_chunks(32)
                .flat_map(|batch| {
                    batch
                        .iter()
                        .map(|item| scan_file(item, canonical_paths))
                        .collect::<Vec<_>>()
                })
                .collect()
        } else {
            let mut results = Vec::with_capacity(entries.len());
            for item in entries {
                results.push(scan_file(item, canonical_paths));
            }
            results
        }
    }
}
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
/// [DirEntry]s.
pub fn scan_directory(
dirs: &Path,
exts: Option<&BTreeSet<&str>>,
exclude: Option<&BTreeSet<&str>>,
scan_opts: &ScanOpts,
) -> Option<Vec<DirEntry>> {
let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
let mut probably_fatal_error = false;
let entries: Vec<DirEntry> = stepper
.filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
.filter_map(|e| {
if let Err(err) = &e {
debug!("uh oh spaghettio!! {:#?}", e);
// log errors to stdout, and remove them from the iterator
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
if err.depth() == 0 {
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
probably_fatal_error = true;
}
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
error!(
"{}: {}",
path,
err.io_error().map_or(err.to_string(), |e| e.to_string())
);
return None;
}
e.ok()
})
// remove directories from the final list
.filter(|e| !e.file_type().is_dir())
// if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
// any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
// if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
// output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
// confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
// https://i.imgur.com/DYG7jlB.png
// adding the symlink filter removes the line that's being pointed to in the image. 0u0
.filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
.collect();
if probably_fatal_error {
None
} else {
Some(entries)
}
}
/// Initialises [`MIMEDB`] with a value dependent on the current backend.
pub fn init_db() {
cfg_if! {

View file

@ -26,7 +26,8 @@ use log::{debug, error, info, trace, warn, Level};
use fif::formats::Format;
use fif::parameters::{OutputFormat};
use fif::utils::{clap_long_version, os_name};
use fif::{init_db, scan_directory, parameters, formats};
use fif::{init_db, parameters, formats};
use fif::files::{scan_directory, scan_from_walkdir};
#[cfg(test)]
mod tests;
@ -89,7 +90,7 @@ fn main() {
trace!("Found {} items to check", entries.len());
let results: Vec<_> = fif::scan_from_walkdir(&entries, args.canonical_paths)
let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths)
.into_iter()
.filter(
|result| result.is_err() || !result.as_ref().unwrap().valid,

View file

@ -3,7 +3,7 @@ use fif::formats::{Format, PowerShell, Shell};
use fif::inspectors::{mime_extension_lookup, BUF_SIZE};
use fif::mime_db::MimeDb;
use fif::string_type::String;
use fif::{scan_directory, scan_from_walkdir};
use fif::files::{scan_directory, scan_from_walkdir};
use crate::parameters::Parameters;
use clap::Clap;