fif/files.rs at 97b0a6edaa353d1f02ad852cf68aff4ebd24e213

fif

343 lines

14 KiB

Rust

Raw Blame History

 use std::collections::{BTreeSet, HashMap};
 use std::fs::File;
 use std::io;
 use std::io::{Read, Seek, SeekFrom};
 use std::path::Path;
 use std::str::FromStr;
 use std::sync::RwLock;
 use cfg_if::cfg_if;
 use log::{debug, error};
 use mime::Mime;
 use mime_guess::from_ext;
 use once_cell::sync::Lazy;
 use walkdir::{DirEntry, WalkDir};
 use crate::findings::{Findings, ScanError};
 use crate::mime_db::MimeDb;
 use crate::parameters::ScanOpts;
 use crate::{String, MIMEDB};
 /// Lazily-initialised, lock-protected cache used by [`mime_extension_lookup`].
 ///
 /// Keys are mime type essence strings; values are the extension lists computed for them. A value of `None` is stored
 /// too, so that mime types with no known extensions are not looked up again.
 static MIMEXT: Lazy<RwLock<HashMap<String, Option<Vec<String>>>>> = Lazy::new(|| RwLock::new(HashMap::new()));
 cfg_if! {
 	if #[cfg(windows)] {
 		/// Reports whether `entry` carries the win32 "hidden" file attribute.
 		///
 		/// If the file's metadata can't be read, the entry is treated as not hidden.
 		pub fn is_hidden(entry: &DirEntry) -> bool {
 			use std::os::windows::prelude::*;
 			// FILE_ATTRIBUTE_HIDDEN - https://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
 			const HIDDEN_FLAG: u32 = 0x2;
 			match std::fs::metadata(entry.path()) {
 				Ok(meta) => meta.file_attributes() & HIDDEN_FLAG > 0,
 				Err(_) => false,
 			}
 		}
 	} else {
 		/// Reports whether `entry` is a dotfile, i.e. its name begins with a full stop.
 		///
 		/// The name `.` itself is not considered hidden, and neither are names that aren't valid UTF-8.
 		pub fn is_hidden(entry: &DirEntry) -> bool {
 			match entry.file_name().to_str() {
 				Some(name) => name.starts_with('.') && name != ".",
 				None => false,
 			}
 		}
 	}
 }
 /// Decides whether `entry` should be kept while walking a directory tree.
 ///
 /// The root entry and all (non-hidden) directories are always kept. Files are kept depending on their extension:
 /// when `exts` is given, only files whose extension is in `exts` pass; otherwise every file passes unless its
 /// extension is in `exclude`. Hidden entries, unknown extensions and extensionless files are handled according to
 /// `scan_opts`.
 pub fn wanted_file(
 	entry: &DirEntry,
 	exts: Option<&BTreeSet<&str>>,
 	exclude: Option<&BTreeSet<&str>>,
 	scan_opts: &ScanOpts,
 ) -> bool {
 	// the root of the walk is never filtered out.
 	if entry.depth() == 0 {
 		return true;
 	}

 	// cheap check first: drop hidden files and directories unless the user asked for them.
 	if !scan_opts.hidden && is_hidden(entry) {
 		return false;
 	}

 	// extension matching only makes sense for files, so directories pass unconditionally.
 	if entry.file_type().is_dir() {
 		return true;
 	}

 	// normalise the extension (lossy UTF-8, lowercase); files without one are governed by a dedicated option.
 	let lowered = match entry.path().extension() {
 		Some(raw) => raw.to_string_lossy().to_lowercase(),
 		None => return scan_opts.extensionless,
 	};
 	let ext = lowered.as_str();

 	// optionally skip extensions that mime_guess has never heard of.
 	if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() {
 		return false;
 	}

 	match (exts, exclude) {
 		// an explicit extension list takes priority over the exclude list.
 		(Some(wanted), _) => wanted.contains(&ext),
 		// no extension list: keep the file unless it has been excluded.
 		(None, Some(unwanted)) => !unwanted.contains(&ext),
 		(None, None) => true,
 	}
 }
 /// Scans a single entry, producing a [`Findings`] describing it, or a [`ScanError`] if that isn't possible.
 ///
 /// A [`ScanError::File`] is returned when the file can't be read (or, with `canonical_paths` set, when its path
 /// can't be canonicalised). A [`ScanError::Mime`] is returned when the file was read but no mimetype could be
 /// determined from its contents.
 pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
 	let path = entry.path();

 	// an IO failure becomes ScanError::File; a successful read with no detected type becomes ScanError::Mime.
 	let mime = mime_type(MIMEDB.get().unwrap(), path)
 		.map_err(|_| ScanError::File(path))?
 		.ok_or(ScanError::Mime(path))?;

 	// the extension is only valid if the mimetype has a known extension list AND the file's own (lowercased)
 	// extension appears in it. a missing list or a missing extension both count as invalid.
 	let valid = match (mime_extension_lookup(mime.essence_str().into()), path.extension()) {
 		(Some(known), Some(ext)) => known.contains(&ext.to_string_lossy().to_lowercase().into()),
 		_ => false,
 	};

 	let file = if canonical_paths {
 		std::fs::canonicalize(path).map_err(|_| ScanError::File(entry.path()))?
 	} else {
 		path.to_path_buf()
 	};

 	Ok(Findings { file, valid, mime })
 }
 /// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
 pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
 	cfg_if! {
 		if #[cfg(feature = "multi-threaded")] {
 			use rayon::prelude::*;
 			// split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
 			entries
 				.par_chunks(32)
 				.flat_map(|chunk| {
 					chunk
 						.iter() // iter over the chunk, which is a slice of DirEntry structs
 						.map(|entry| scan_file(entry, canonical_paths))
 						.collect::<Vec<_>>()
 				})
 				.collect()
 		} else {
 			entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
 		}
 	}
 }
 /// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
 /// [DirEntry]s.
 pub fn scan_directory(
 	dirs: &Path,
 	exts: Option<&BTreeSet<&str>>,
 	exclude: Option<&BTreeSet<&str>>,
 	scan_opts: &ScanOpts,
 ) -> Option<Vec<DirEntry>> {
 	let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
 	let mut probably_fatal_error = false;
 	let entries: Vec<DirEntry> = stepper
 		.filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
 		.filter_map(|e| {
 			if let Err(err) = &e {
 				debug!("uh oh spaghettio!! {:#?}", e);
 				// log errors to stdout, and remove them from the iterator
 				let path = err.path().map_or("General error".into(), Path::to_string_lossy);
 				if err.depth() == 0 {
 					// if something goes wrong while trying to read the root directory, we're probably not going to get much done
 					probably_fatal_error = true;
 				}
 				// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
 				// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
 				// implements ToString (which they both do). map_or doesn't work on trait objects though :(
 				error!(
 					"{}: {}",
 					path,
 					err.io_error().map_or(err.to_string(), |e| e.to_string())
 				);
 				return None;
 			}
 			e.ok()
 		})
 		// remove directories from the final list
 		.filter(|e| !e.file_type().is_dir())
 		// if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
 		// any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
 		// if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
 		// output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
 		// confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
 		// https://i.imgur.com/DYG7jlB.png
 		// adding the symlink filter removes the line that's being pointed to in the image. 0u0
 		.filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
 		.collect();
 	if probably_fatal_error {
 		None
 	} else {
 		Some(entries)
 	}
 }
 /// The number of bytes to read initially.
 ///
 /// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
 /// chunk of the file and try to identify that, proceeding with the larger buffer only if that fails. Many file
 /// formats can be identified from their first few dozen bytes, so the "happy path" will likely be taken in the
 /// majority of cases.
 pub const INITIAL_BUF_SIZE: usize = 128;
 /// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
 pub const BUF_SIZE: usize = 8192;
 /// Tries to identify the mimetype of a file from a given path.
 ///
 /// The first [`INITIAL_BUF_SIZE`] bytes are inspected first. If that is not enough to settle on a type (or the type
 /// found is one that is known to need a closer look), up to [`BUF_SIZE`] bytes are read from the start of the file
 /// and inspected instead. Only bytes that were actually read from the file are handed to `db`.
 ///
 /// Returns `Ok(None)` if the file was read successfully but `db` could not identify it.
 ///
 /// # Errors
 ///
 /// Returns any IO error encountered while opening, reading or seeking the file.
 pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
 	/// Reads from `source` until `buf` is full or the end of the input is reached, retrying reads that were
 	/// interrupted. Returns the number of bytes placed in `buf`.
 	fn fill_buffer<R: Read>(source: &mut R, buf: &mut [u8]) -> io::Result<usize> {
 		let mut filled = 0;
 		while filled < buf.len() {
 			match source.read(&mut buf[filled..]) {
 				Ok(0) => break, // end of file
 				Ok(n) => filled += n,
 				Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
 				Err(e) => return Err(e),
 			}
 		}
 		Ok(filled)
 	}

 	let mut file = File::open(path)?;

 	// read a small amount to start with
 	let mut buffer = [0u8; INITIAL_BUF_SIZE];
 	let read = fill_buffer(&mut file, &mut buffer)?;
 	let first_guess = db.get_type(&buffer[..read]);

 	if read < INITIAL_BUF_SIZE {
 		// the whole file fit into the initial buffer, so reading it again can't reveal anything new.
 		return Ok(first_guess);
 	}

 	let confident = first_guess.filter(|mime|
 		// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
 		// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
 		mime != &mime::TEXT_XML
 			// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
 			// determining that a file is in one of the MS office formats in particular requires looking quite far into the
 			// file.
 			&& mime != &Mime::from_str("application/zip").expect("literal is a valid mime type")
 			// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
 			// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
 			// will allow it to be detected correctly as the appropriate filetype.
 			&& mime != &Mime::from_str("application/x-ole-storage").expect("literal is a valid mime type"));

 	if confident.is_some() {
 		return Ok(confident);
 	}

 	// no luck (or a type that deserves a closer look): rewind and inspect up to BUF_SIZE bytes from the start.
 	let mut buffer = [0u8; BUF_SIZE];
 	file.seek(SeekFrom::Start(0))?;
 	let read = fill_buffer(&mut file, &mut buffer)?;
 	Ok(db.get_type(&buffer[..read]))
 }
 /// Returns a list of known extensions for this mime type, if any.
 ///
 /// This function uses the [`Mime`]'s "essence" rather than the [`Mime`] itself - `mime_guess::get_mime_extensions`
 /// ignores the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing
 /// the `essence_str` (which includes the suffix) fixes this.
 ///
 /// Results (including "no extensions known") are cached in `MIMEXT`, so each essence is only looked up once. If the
 /// cache lock is poisoned, the lookup still succeeds - the result simply isn't read from or written to the cache.
 pub fn mime_extension_lookup(essence: String) -> Option<Vec<String>> {
 	/// Builds an extension list that starts with `preferred` and continues with everything in `rest`.
 	fn prefer(preferred: &[&str], rest: Vec<String>) -> Vec<String> {
 		let mut merged: Vec<String> = preferred.iter().map(|e| String::from(*e)).collect();
 		merged.extend(rest);
 		merged
 	}

 	if let Ok(cache) = MIMEXT.read() {
 		if let Some(exts) = cache.get(&essence) {
 			return exts.clone();
 		}
 	}

 	let found = mime_guess::get_mime_extensions_str(essence.as_str()).or_else(|| {
 		// no matches :c
 		// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
 		// but mime_guess only understands "some/thing", or vice-versa.
 		// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
 		// "some/x-thing".
 		if essence.contains("/x-") {
 			// replace e.g. "application/x-gzip" with "application/gzip"
 			mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"))
 		} else {
 			// replace e.g. "video/mp2t" with "video/x-mp2t"
 			mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"))
 		}
 	});

 	let exts = found.map(|found| {
 		let possible_exts: Vec<String> = found.iter().map(|e| String::from(*e)).collect();

 		if essence == mime::IMAGE_JPEG.essence_str() {
 			// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
 			// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
 			// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
 			prefer(&["jpg"], possible_exts)
 		} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
 			// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
 			// (in my opinion) be "xml".
 			// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
 			// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
 			// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
 			// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
 			// to have valid extensions.
 			// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
 			// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
 			// extension is classed as application/*+xml, consider it OK
 			prefer(&["xml", "svg"], possible_exts)
 		} else if essence == "application/msword" {
 			// classic office files considered harmful - the list from mime_guess is deliberately replaced outright.
 			prefer(&["doc", "xls", "ppt"], Vec::new())
 		} else if essence == "application/zip" {
 			// neither xdg-mime nor infer seem to be able to detect office XML files properly...
 			prefer(&["zip", "docx", "xlsx", "pptx"], possible_exts)
 		} else if essence == "application/x-ms-dos-executable" {
 			// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
 			// other!
 			prefer(&["dll", "exe"], possible_exts)
 		} else {
 			possible_exts
 		}
 	});

 	// remember the answer for next time. a poisoned lock only costs us the caching, never the result - this mirrors
 	// the read path above, which also carries on without the cache if the lock can't be acquired.
 	if let Ok(mut cache) = MIMEXT.write() {
 		cache.insert(essence, exts.clone());
 	}
 	exts
 }

343 lines 14 KiB Rust Raw Blame History

343 lines

14 KiB

Rust

Raw Blame History