From 97b0a6edaa353d1f02ad852cf68aff4ebd24e213 Mon Sep 17 00:00:00 2001 From: Lynnesbian Date: Fri, 24 Sep 2021 22:02:04 +1000 Subject: [PATCH] replace cached dep with custom hashmap thing i benchmarked it with hyperfine and in terms of performance it's pretty much identical, with a slight (fraction of a percent) advantage to my implementation --- CHANGELOG.md | 1 + Cargo.toml | 2 +- src/files.rs | 153 ++++++++++++++++++++++++++++----------------------- 3 files changed, 86 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 305b7e1..50e73a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ files module, removed string module, etc. "Features" heading into "Added" and "Changed" sections, renaming "Bugfixes" to "Fixed", and removing the headings that (pointlessly?) previously divided the changelog into v0.3, v0.2, and v0.1 - A few minor grammar tweaks and reorganisations +- Replaced [`cached`] dependency with a simple HashMap-backed store ## v0.3.6 - 2021-08-16 ### Other diff --git a/Cargo.toml b/Cargo.toml index 744a5f9..d34b371 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,4 +94,4 @@ opt-level = 3 opt-level = 3 [package.metadata] -msrv = "1.43.0" \ No newline at end of file +msrv = "1.43.0" diff --git a/src/files.rs b/src/files.rs index 29f75cf..d4c13a4 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1,15 +1,16 @@ -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use std::fs::File; use std::io; use std::io::{Read, Seek, SeekFrom}; use std::path::Path; use std::str::FromStr; +use std::sync::RwLock; -use cached::cached; use cfg_if::cfg_if; use log::{debug, error}; use mime::Mime; use mime_guess::from_ext; +use once_cell::sync::Lazy; use walkdir::{DirEntry, WalkDir}; use crate::findings::{Findings, ScanError}; @@ -17,6 +18,8 @@ use crate::mime_db::MimeDb; use crate::parameters::ScanOpts; use crate::{String, MIMEDB}; +static MIMEXT: Lazy>>>> = Lazy::new(|| RwLock::new(HashMap::new())); + cfg_if! 
{ if #[cfg(windows)] { /// Determines whether or not a file is hidden by checking its win32 file attributes. @@ -255,73 +258,85 @@ pub fn mime_type(db: &T, path: &Path) -> io::Result> { Ok(db.get_type(&buffer)) } -cached! { - MIMEXT; - fn mime_extension_lookup(essence: String) -> Option> = { - // Returns a list of known extensions for this mime type, if any. - // This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores - // the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the - // essence_str (which includes the suffix) fixes this. - // ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the - // proc_macro version of cached, but it has a huge number of deps :c - - let essence = essence.as_str(); - let mut exts = mime_guess::get_mime_extensions_str(essence); - if exts.is_none() { - // no matches :c - // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing" - // but mime_guess only understands "some/thing", or vice-versa. - // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with - // "some/x-thing". - if essence.contains("/x-") { - // replace e.g. "application/x-gzip" with "application/gzip" - exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/")); - } else { - // replace e.g. "video/mp2t" with "video/x-mp2t" - exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-")); - } - } - - match exts { - Some(exts) => { - let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); - - Some(if essence == mime::IMAGE_JPEG.essence_str() { - // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are - // far more common than jpe, so it makes sense to suggest one of those rather than jpe. 
to do this, we can - // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. - [vec![String::from("jpg")], possible_exts].concat() - - } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" { - // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should - // (in my opinion) be "xml". - // there's also another problem: SVG files can easily be misidentified as XML files, because they usually - // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read - // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg" - // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered - // to have valid extensions. - // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to - // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its - // extension is classed as application/*+xml, consider it OK - [vec![String::from("xml"), String::from("svg")], possible_exts].concat() - - } else if essence == "application/msword" { - // classic office files considered harmful - vec![String::from("doc"), String::from("xls"), String::from("ppt")] - - } else if essence == "application/zip" { - // neither xdg-mime nor infer seem to be able to detect office XML files properly... - [vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat() - - } else if essence == "application/x-ms-dos-executable" { - // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the - // other! 
- [vec![String::from("dll"), String::from("exe")], possible_exts].concat() - } else { - possible_exts - }) - }, - None => None +/// Returns a list of known extensions for this mime type, if any. Results (including `None`) are cached in `MIMEXT`. +/// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores +/// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the +/// essence_str (which includes the suffix) fixes this. +pub fn mime_extension_lookup(essence: String) -> Option> { + if let Ok(cache) = MIMEXT.read() { + if let Some(exts) = cache.get(&essence) { + return exts.clone(); } } + + let essence = essence; + let mut exts = mime_guess::get_mime_extensions_str(essence.as_str()); + if exts.is_none() { + // no matches :c + // mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing" + // but mime_guess only understands "some/thing", or vice-versa. + // so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with + // "some/x-thing". + if essence.contains("/x-") { + // replace e.g. "application/x-gzip" with "application/gzip" + exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/")); + } else { + // replace e.g. "video/mp2t" with "video/x-mp2t" + exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-")); + } + } + + let exts = match exts { + Some(exts) => { + let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); + + Some(if essence == mime::IMAGE_JPEG.essence_str() { + // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are + // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can + // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
+ [vec![String::from("jpg")], possible_exts].concat() + } else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" { + // a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should + // (in my opinion) be "xml". + // there's also another problem: SVG files can easily be misidentified as XML files, because they usually + // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read + // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg" + // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered + // to have valid extensions. + // TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to + // "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its + // extension is classed as application/*+xml, consider it OK + [vec![String::from("xml"), String::from("svg")], possible_exts].concat() + } else if essence == "application/msword" { + // classic office files considered harmful + vec![String::from("doc"), String::from("xls"), String::from("ppt")] + } else if essence == "application/zip" { + // neither xdg-mime nor infer seem to be able to detect office XML files properly... + [ + vec![ + String::from("zip"), + String::from("docx"), + String::from("xlsx"), + String::from("pptx"), + ], + possible_exts, + ] + .concat() + } else if essence == "application/x-ms-dos-executable" { + // both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the + // other! + [vec![String::from("dll"), String::from("exe")], possible_exts].concat() + } else { + possible_exts + }) + } + None => None, + }; + + if let Ok(mut cache) = MIMEXT.write() { + cache.insert(essence, exts.clone()); + exts + } else { + unreachable!() // NOTE(review): reached if the MIMEXT lock is poisoned (a thread panicked while holding it) - confirm a panic is intended here + } }