replace cached dep with custom hashmap thing

i benchmarked it with hyperfine and in terms of performance it's pretty much identical, with a slight (fraction of a percent) advantage to my implementation
This commit is contained in:
Lynne Megido 2021-09-24 22:02:04 +10:00
parent 3d41183f1c
commit 97b0a6edaa
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90
3 changed files with 86 additions and 70 deletions

View file

@ -16,6 +16,7 @@ files module, removed string module, etc.
"Features" heading into "Added" and "Changed" sections, renaming "Bugfixes" to "Fixed", and removing the
headings that (pointlessly?) previously divided the changelog into v0.3, v0.2, and v0.1
- A few minor grammar tweaks and reorganisations
- Replaced [`cached`] dependency with a simple HashMap-backed store
## v0.3.6 - 2021-08-16
### Other

View file

@ -94,4 +94,4 @@ opt-level = 3
opt-level = 3
[package.metadata]
msrv = "1.43.0"
msrv = "1.43.0"

View file

@ -1,15 +1,16 @@
use std::collections::BTreeSet;
use std::collections::{BTreeSet, HashMap};
use std::fs::File;
use std::io;
use std::io::{Read, Seek, SeekFrom};
use std::path::Path;
use std::str::FromStr;
use std::sync::RwLock;
use cached::cached;
use cfg_if::cfg_if;
use log::{debug, error};
use mime::Mime;
use mime_guess::from_ext;
use once_cell::sync::Lazy;
use walkdir::{DirEntry, WalkDir};
use crate::findings::{Findings, ScanError};
@ -17,6 +18,8 @@ use crate::mime_db::MimeDb;
use crate::parameters::ScanOpts;
use crate::{String, MIMEDB};
static MIMEXT: Lazy<RwLock<HashMap<String, Option<Vec<String>>>>> = Lazy::new(|| RwLock::new(HashMap::new()));
cfg_if! {
if #[cfg(windows)] {
/// Determines whether or not a file is hidden by checking its win32 file attributes.
@ -255,73 +258,85 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
Ok(db.get_type(&buffer))
}
cached! {
MIMEXT;
fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
// Returns a list of known extensions for this mime type, if any.
// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
// essence_str (which includes the suffix) fixes this.
// ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the
// proc_macro version of cached, but it has a huge number of deps :c
let essence = essence.as_str();
let mut exts = mime_guess::get_mime_extensions_str(essence);
if exts.is_none() {
// no matches :c
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
// but mime_guess only understands "some/thing", or vice-versa.
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
// "some/x-thing".
if essence.contains("/x-") {
// replace e.g. "application/x-gzip" with "application/gzip"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
} else {
// replace e.g. "video/mp2t" with "video/x-mp2t"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
}
}
match exts {
Some(exts) => {
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
Some(if essence == mime::IMAGE_JPEG.essence_str() {
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
[vec![String::from("jpg")], possible_exts].concat()
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
// (in my opinion) be "xml".
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
// to have valid extensions.
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
// extension is classed as application/*+xml, consider it OK
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
} else if essence == "application/msword" {
// classic office files considered harmful
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
} else if essence == "application/zip" {
// neither xdg-mime nor infer seem to be able to detect office XML files properly...
[vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat()
} else if essence == "application/x-ms-dos-executable" {
// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
// other!
[vec![String::from("dll"), String::from("exe")], possible_exts].concat()
} else {
possible_exts
})
},
None => None
// Returns a list of known extensions for this mime type, if any.
// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
// essence_str (which includes the suffix) fixes this.
pub fn mime_extension_lookup(essence: String) -> Option<Vec<String>> {
if let Ok(cache) = MIMEXT.read() {
if let Some(exts) = cache.get(&essence) {
return exts.clone();
}
}
let essence = essence;
let mut exts = mime_guess::get_mime_extensions_str(essence.as_str());
if exts.is_none() {
// no matches :c
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
// but mime_guess only understands "some/thing", or vice-versa.
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
// "some/x-thing".
if essence.contains("/x-") {
// replace e.g. "application/x-gzip" with "application/gzip"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
} else {
// replace e.g. "video/mp2t" with "video/x-mp2t"
exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
}
}
let exts = match exts {
Some(exts) => {
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
Some(if essence == mime::IMAGE_JPEG.essence_str() {
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
[vec![String::from("jpg")], possible_exts].concat()
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
// (in my opinion) be "xml".
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
// to have valid extensions.
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
// extension is classed as application/*+xml, consider it OK
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
} else if essence == "application/msword" {
// classic office files considered harmful
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
} else if essence == "application/zip" {
// neither xdg-mime nor infer seem to be able to detect office XML files properly...
[
vec![
String::from("zip"),
String::from("docx"),
String::from("xlsx"),
String::from("pptx"),
],
possible_exts,
]
.concat()
} else if essence == "application/x-ms-dos-executable" {
// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
// other!
[vec![String::from("dll"), String::from("exe")], possible_exts].concat()
} else {
possible_exts
})
}
None => None,
};
if let Ok(mut cache) = MIMEXT.write() {
cache.insert(essence, exts.clone());
exts
} else {
unreachable!()
}
}