use std::collections::{BTreeSet, HashMap}; use std::fs::File; use std::io; use std::io::{Read, Seek, SeekFrom}; use std::ops::Deref; use std::path::Path; use std::str::FromStr; use std::sync::RwLock; use cfg_if::cfg_if; use log::{debug, error}; use mime::Mime; use mime_guess::from_ext; use once_cell::sync::Lazy; use walkdir::{DirEntry, WalkDir}; use crate::findings::{Findings, ScanError}; use crate::mime_db::MimeDb; use crate::parameters::ScanOpts; use crate::{String, MIMEDB}; static MIMEXT: Lazy>>>> = Lazy::new(|| RwLock::new(HashMap::new())); cfg_if! { if #[cfg(windows)] { /// Determines whether or not a file is hidden by checking its win32 file attributes. pub fn is_hidden(entry: &DirEntry) -> bool { use std::os::windows::prelude::*; std::fs::metadata(entry.path()) // try to get metadata for file .map_or( false, // if getting metadata/attributes fails, assume it's not hidden |f| f.file_attributes() & 0x2 > 0, // flag for hidden - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants ) } } else { /// Determines whether or not a file is hidden by checking for a leading full stop. pub fn is_hidden(entry: &DirEntry) -> bool { entry .file_name() .to_str() .map_or(false, |f| f.starts_with('.') && f != ".") } } } /// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in /// `exts` (if specified), potentially skipping over hidden files, and so on. pub fn wanted_file( entry: &DirEntry, exts: Option<&BTreeSet<&str>>, exclude: Option<&BTreeSet<&str>>, scan_opts: &ScanOpts, ) -> bool { if entry.depth() == 0 { // the root directory should always be scanned. return true; } if !scan_opts.hidden && is_hidden(entry) { // skip hidden files and directories. this check is performed first because it's very lightweight. return false; } if entry.file_type().is_dir() { // always allow directories - there's no point doing file extension matching on something that isn't a file. return true; } if let Some(ext) = entry.path().extension() { // file has extension - discard invalid UTF-8 and normalise it to lowercase. let ext = ext.to_string_lossy().to_lowercase(); let ext = ext.as_str(); if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() { // unknown extension, skip. return false; } if let Some(exts) = exts { // only scan if the file has one of the specified extensions. exts.contains(&ext) } else { // no extensions specified - the file should be scanned unless its extension is on the exclude list. exclude.map_or(true, |exclude| !exclude.contains(&ext)) } } else { // no file extension scan_opts.extensionless } } /// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure. /// /// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a /// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a mimetype could not be /// determined. pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result { let path = entry.path(); // try to determine mimetype for this entry let result = match mime_type(MIMEDB.deref(), path) { // an error occurred while trying to read the file Err(_) => return Err(ScanError::File(path)), // the file was read successfully, but we were unable to determine its mimetype Ok(None) => return Err(ScanError::Mime(path)), // a mimetype was found! Ok(Some(result)) => result, }; // set of known extensions for the given mimetype let known_exts = mime_extension_lookup(result.essence_str().into()); // file extension for this particular file let entry_ext = path.extension(); let valid = match known_exts { // there is a known set of extensions for this mimetype, and the file has an extension Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_string_lossy().to_lowercase().into()), // either this file has no extension, or there is no known set of extensions for this mimetype :( Some(_) | None => false, }; let path = if canonical_paths { match std::fs::canonicalize(path) { Ok(path) => path, Err(_) => return Err(ScanError::File(entry.path())), } } else { path.to_path_buf() // :c }; Ok(Findings { file: path, valid, mime: result, }) } /// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector. pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec> { cfg_if! { if #[cfg(feature = "multi-threaded")] { use rayon::prelude::*; // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread entries .par_chunks(32) .flat_map(|chunk| { chunk .iter() // iter over the chunk, which is a slice of DirEntry structs .map(|entry| scan_file(entry, canonical_paths)) .collect::>() }) .collect() } else { entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect() } } } /// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of /// [DirEntry]s. pub fn scan_directory( dirs: &Path, exts: Option<&BTreeSet<&str>>, exclude: Option<&BTreeSet<&str>>, scan_opts: &ScanOpts, ) -> Option> { let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter(); let mut probably_fatal_error = false; let entries: Vec = stepper .filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files .filter_map(|e| { if let Err(err) = &e { debug!("uh oh spaghettio!! {:#?}", e); // log errors to stdout, and remove them from the iterator let path = err.path().map_or("General error".into(), Path::to_string_lossy); if err.depth() == 0 { // if something goes wrong while trying to read the root directory, we're probably not going to get much done probably_fatal_error = true; } // TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`? // i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it // implements ToString (which they both do). map_or doesn't work on trait objects though :( error!( "{}: {}", path, err.io_error().map_or(err.to_string(), |e| e.to_string()) ); return None; } e.ok() }) // remove directories from the final list .filter(|e| !e.file_type().is_dir()) // if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore // any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as // if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the // output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of // confusing, but it's honestly kind of hard to explain... maybe a screenshot is better: // https://i.imgur.com/DYG7jlB.png // adding the symlink filter removes the line that's being pointed to in the image. 0u0 .filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink()) .collect(); if probably_fatal_error { None } else { Some(entries) } } /// The number of bytes to read initially. /// /// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small /// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats /// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases. pub const INITIAL_BUF_SIZE: usize = 128; /// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes. pub const BUF_SIZE: usize = 8192; /// Tries to identify the mimetype of a file from a given path. pub fn mime_type(db: &T, path: &Path) -> io::Result> { let mut buffer = [0; INITIAL_BUF_SIZE]; let mut file = File::open(path)?; // read a small amount to start with file.read(&mut buffer)?; let r = db.get_type(&buffer).filter(|mime| // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG) mime != &mime::TEXT_XML // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures. // determining that a file is in one of the MS office formats in particular requires looking quite far into the // file. && mime != &Mime::from_str("application/zip").unwrap() // doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to // shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further // will allow it to be detected correctly as the appropriate filetype. && mime != &Mime::from_str("application/x-ole-storage").unwrap()); if r.is_some() { return Ok(r); } // attempt to read up to the BUF_SIZE bytes of the file. // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes. // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator, // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both // idiomatic/safe and fast. let mut buffer = [0; BUF_SIZE]; file.seek(SeekFrom::Start(0))?; file.read(&mut buffer)?; Ok(db.get_type(&buffer)) } // Returns a list of known extensions for this mime type, if any. // This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores // the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the // essence_str (which includes the suffix) fixes this. pub fn mime_extension_lookup(essence: String) -> Option> { if let Ok(cache) = MIMEXT.read() { if let Some(exts) = cache.get(&essence) { return exts.clone(); } } let essence = essence; let mut exts = mime_guess::get_mime_extensions_str(essence.as_str()); if exts.is_none() { // no matches :c // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing" // but mime_guess only understands "some/thing", or vice-versa. // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with // "some/x-thing". if essence.contains("/x-") { // replace e.g. "application/x-gzip" with "application/gzip" exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/")); } else { // replace e.g. "video/mp2t" with "video/x-mp2t" exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-")); } } let exts = match exts { Some(exts) => { let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); Some(if essence == mime::IMAGE_JPEG.essence_str() { // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. [vec![String::from("jpg")], possible_exts].concat() } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" { // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should // (in my opinion) be "xml". // there's also another problem: SVG files can easily be misidentified as XML files, because they usually // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg" // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered // to have valid extensions. // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its // extension is classed as application/*+xml, consider it OK [vec![String::from("xml"), String::from("svg")], possible_exts].concat() } else if essence == "application/msword" { // classic office files considered harmful vec![String::from("doc"), String::from("xls"), String::from("ppt")] } else if essence == "application/zip" { // neither xdg-mime nor infer seem to be able to detect office XML files properly... [ vec![ String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx"), ], possible_exts, ] .concat() } else if essence == "application/x-ms-dos-executable" { // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the // other! [vec![String::from("dll"), String::from("exe")], possible_exts].concat() } else { possible_exts }) } None => None, }; if let Ok(mut cache) = MIMEXT.write() { cache.insert(essence, exts.clone()); exts } else { unreachable!() } }