reorganise files.rs, use lazy Mimes in mime_type
This commit is contained in:
parent
667ee440e0
commit
ec10b58482
1 changed files with 18 additions and 12 deletions
30
src/files.rs
30
src/files.rs
|
@ -6,6 +6,7 @@
|
|||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::fs::File;
|
||||
use std::io::{self, Read, Seek, SeekFrom};
|
||||
use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
|
@ -22,10 +23,25 @@ use crate::findings::{Findings, ScanError};
|
|||
use crate::mime_db::MimeDb;
|
||||
use crate::parameters::ScanOpts;
|
||||
use crate::{String, MIMEDB};
|
||||
use crate::utils::APPLICATION_ZIP;
|
||||
|
||||
/// Cache of mimetypes and their associated extensions, used by [`mime_extension_lookup()`]
|
||||
// Initialised lazily to an empty map wrapped in the lock.
static MIMEXT: Lazy<RwLock<HashMap<String, Option<Vec<String>>>>> =
    Lazy::new(Default::default);
|
||||
|
||||
/// The number of bytes to read initially when identifying a file's MIME type. Used in the [`mime_type`] function.
|
||||
///
|
||||
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
|
||||
/// chunk of the file and try to identify that, proceeding with the larger buffer if that fails. Many file formats
|
||||
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
|
||||
pub const INITIAL_BUF_SIZE: usize = 128;
|
||||
|
||||
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes. Used in
|
||||
/// the [`mime_type`] function.
|
||||
pub const BUF_SIZE: usize = 8192;
|
||||
|
||||
/// A [`Mime`] representing the "application/x-ole-storage" mimetype.
|
||||
static APPLICATION_X_OLE_STORAGE: Lazy<Mime> = Lazy::new(|| {
    // The literal is a fixed, well-formed type/subtype pair, so a parse failure here can only be a programming
    // error; state that invariant in the panic message rather than using a bare `unwrap()`.
    Mime::from_str("application/x-ole-storage")
        .expect("\"application/x-ole-storage\" should parse as a valid MIME type")
});
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(windows)] {
|
||||
/// Determines whether or not a file is hidden by checking its win32 file attributes.
|
||||
|
@ -233,16 +249,6 @@ pub fn scan_directory(
|
|||
}
|
||||
}
|
||||
|
||||
/// The number of bytes to read initially.
|
||||
///
|
||||
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
|
||||
/// chunk of the file and try to identify that, proceeding with the larger buffer if that fails. Many file formats
|
||||
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
|
||||
pub const INITIAL_BUF_SIZE: usize = 128;
|
||||
|
||||
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes.
|
||||
pub const BUF_SIZE: usize = 8192;
|
||||
|
||||
/// Tries to identify the mimetype of a file from a given path.
|
||||
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||
let mut buffer = [0; INITIAL_BUF_SIZE];
|
||||
|
@ -278,11 +284,11 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
|||
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
|
||||
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
|
||||
// file.
|
||||
&& mime != &Mime::from_str("application/zip").unwrap()
|
||||
&& mime != APPLICATION_ZIP.deref()
|
||||
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
|
||||
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
|
||||
// will allow it to be detected correctly as the appropriate filetype.
|
||||
&& mime != &Mime::from_str("application/x-ole-storage").unwrap());
|
||||
&& mime != APPLICATION_X_OLE_STORAGE.deref());
|
||||
|
||||
if r.is_some() {
|
||||
return Ok(r);
|
||||
|
|
Loading…
Reference in a new issue