reorganise files.rs, use lazy Mimes in mime_type

This commit is contained in:
Lynne Megido 2021-11-25 06:26:57 +10:00
parent 667ee440e0
commit ec10b58482
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90

View file

@ -6,6 +6,7 @@
use std::collections::{BTreeSet, HashMap};
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom};
use std::ops::Deref;
use std::path::Path;
use std::str::FromStr;
@ -22,10 +23,25 @@ use crate::findings::{Findings, ScanError};
use crate::mime_db::MimeDb;
use crate::parameters::ScanOpts;
use crate::{String, MIMEDB};
use crate::utils::APPLICATION_ZIP;
/// Cache of mimetypes and their associated extensions, used by [`mime_extension_lookup()`].
///
/// The cache is a `HashMap` keyed by `String` whose values are `Option<Vec<String>>`, wrapped in an `RwLock`. The
/// `Lazy` initialiser starts it out as an empty map.
// NOTE(review): presumably a `None` value records a mimetype for which no extensions are known, so the lookup isn't
// repeated -- confirm against `mime_extension_lookup()`.
static MIMEXT: Lazy<RwLock<HashMap<String, Option<Vec<String>>>>> = Lazy::new(|| RwLock::new(HashMap::new()));
/// The number of bytes to read initially when identifying a file's MIME type. Used in the [`mime_type`] function.
///
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
/// chunk of the file and try to identify that, proceeding with the larger buffer only if that fails. Many file
/// formats can be identified from their first few dozen bytes, so the "happy path" will likely be taken in the
/// majority of cases.
pub const INITIAL_BUF_SIZE: usize = 128;
/// The number of bytes (8 KiB) to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
/// Used in the [`mime_type`] function.
pub const BUF_SIZE: usize = 8192;
/// A [`Mime`] representing the "application/x-ole-storage" mimetype.
///
/// Stored in a `Lazy` so that the string only has to be parsed into a [`Mime`] by the initialiser, rather than at each
/// place the value is compared against.
// The `unwrap()` is applied to the result of parsing a fixed string literal, so a failure here would be a programming
// error rather than a consequence of bad input.
// NOTE(review): assumes `Lazy` runs its initialiser once on first dereference -- confirm against the `Lazy` import.
static APPLICATION_X_OLE_STORAGE: Lazy<Mime> = Lazy::new(|| Mime::from_str("application/x-ole-storage").unwrap());
cfg_if! {
if #[cfg(windows)] {
/// Determines whether or not a file is hidden by checking its win32 file attributes.
@ -233,16 +249,6 @@ pub fn scan_directory(
}
}
/// The number of bytes to read initially.
///
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
/// chunk of the file and try to identify that, proceeding with the larger buffer only if that fails. Many file
/// formats can be identified from their first few dozen bytes, so the "happy path" will likely be taken in the
/// majority of cases.
pub const INITIAL_BUF_SIZE: usize = 128;
/// The number of bytes (8 KiB) to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
pub const BUF_SIZE: usize = 8192;
/// Tries to identify the mimetype of a file from a given path.
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
let mut buffer = [0; INITIAL_BUF_SIZE];
@ -278,11 +284,11 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
// file.
&& mime != &Mime::from_str("application/zip").unwrap()
&& mime != APPLICATION_ZIP.deref()
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
// will allow it to be detected correctly as the appropriate filetype.
&& mime != &Mime::from_str("application/x-ole-storage").unwrap());
&& mime != APPLICATION_X_OLE_STORAGE.deref());
if r.is_some() {
return Ok(r);