Replace the `cached` dependency with a custom HashMap-backed cache
I benchmarked it with hyperfine: in terms of performance it's pretty much identical, with a slight (fraction of a percent) advantage to my implementation.
This commit is contained in:
parent
3d41183f1c
commit
97b0a6edaa
3 changed files with 86 additions and 70 deletions
|
@ -16,6 +16,7 @@ files module, removed string module, etc.
|
||||||
"Features" heading into "Added" and "Changed" sections, renaming "Bugfixes" to "Fixed", and removing the
|
"Features" heading into "Added" and "Changed" sections, renaming "Bugfixes" to "Fixed", and removing the
|
||||||
headings that (pointlessly?) previously divided the changelog into v0.3, v0.2, and v0.1
|
headings that (pointlessly?) previously divided the changelog into v0.3, v0.2, and v0.1
|
||||||
- A few minor grammar tweaks and reorganisations
|
- A few minor grammar tweaks and reorganisations
|
||||||
|
- Replaced [`cached`] dependency with a simple HashMap-backed store
|
||||||
|
|
||||||
## v0.3.6 - 2021-08-16
|
## v0.3.6 - 2021-08-16
|
||||||
### Other
|
### Other
|
||||||
|
|
|
@ -94,4 +94,4 @@ opt-level = 3
|
||||||
opt-level = 3
|
opt-level = 3
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
msrv = "1.43.0"
|
msrv = "1.43.0"
|
||||||
|
|
153
src/files.rs
153
src/files.rs
|
@ -1,15 +1,16 @@
|
||||||
use std::collections::BTreeSet;
|
use std::collections::{BTreeSet, HashMap};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::{Read, Seek, SeekFrom};
|
use std::io::{Read, Seek, SeekFrom};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
use std::sync::RwLock;
|
||||||
|
|
||||||
use cached::cached;
|
|
||||||
use cfg_if::cfg_if;
|
use cfg_if::cfg_if;
|
||||||
use log::{debug, error};
|
use log::{debug, error};
|
||||||
use mime::Mime;
|
use mime::Mime;
|
||||||
use mime_guess::from_ext;
|
use mime_guess::from_ext;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
use walkdir::{DirEntry, WalkDir};
|
use walkdir::{DirEntry, WalkDir};
|
||||||
|
|
||||||
use crate::findings::{Findings, ScanError};
|
use crate::findings::{Findings, ScanError};
|
||||||
|
@ -17,6 +18,8 @@ use crate::mime_db::MimeDb;
|
||||||
use crate::parameters::ScanOpts;
|
use crate::parameters::ScanOpts;
|
||||||
use crate::{String, MIMEDB};
|
use crate::{String, MIMEDB};
|
||||||
|
|
||||||
|
/// Process-wide memoisation table, keyed by a mime type's essence string, whose values are the extension lists
/// (or `None`) stored for that key.
/// The map is created empty on first access (via `Lazy`) and is guarded by an `RwLock`.
static MIMEXT: Lazy<RwLock<HashMap<String, Option<Vec<String>>>>> = Lazy::new(|| RwLock::new(HashMap::new()));
|
||||||
|
|
||||||
cfg_if! {
|
cfg_if! {
|
||||||
if #[cfg(windows)] {
|
if #[cfg(windows)] {
|
||||||
/// Determines whether or not a file is hidden by checking its win32 file attributes.
|
/// Determines whether or not a file is hidden by checking its win32 file attributes.
|
||||||
|
@ -255,73 +258,85 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
Ok(db.get_type(&buffer))
|
Ok(db.get_type(&buffer))
|
||||||
}
|
}
|
||||||
|
|
||||||
cached! {
|
// Returns a list of known extensions for this mime type, if any.
|
||||||
MIMEXT;
|
// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
|
||||||
fn mime_extension_lookup(essence: String) -> Option<Vec<String>> = {
|
// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
|
||||||
// Returns a list of known extensions for this mime type, if any.
|
// essence_str (which includes the suffix) fixes this.
|
||||||
// This function uses the [Mime]'s "essence" rather than the [Mime] itself - mime_guess::get_mime_extensions ignores
|
pub fn mime_extension_lookup(essence: String) -> Option<Vec<String>> {
|
||||||
// the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
|
if let Ok(cache) = MIMEXT.read() {
|
||||||
// essence_str (which includes the suffix) fixes this.
|
if let Some(exts) = cache.get(&essence) {
|
||||||
// ↑ this is supposed to be a doc comment, but the cached! macro doesn't support that... i would switch to the
|
return exts.clone();
|
||||||
// proc_macro version of cached, but it has a huge number of deps :c
|
|
||||||
|
|
||||||
let essence = essence.as_str();
|
|
||||||
let mut exts = mime_guess::get_mime_extensions_str(essence);
|
|
||||||
if exts.is_none() {
|
|
||||||
// no matches :c
|
|
||||||
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
|
|
||||||
// but mime_guess only understands "some/thing", or vice-versa.
|
|
||||||
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
|
|
||||||
// "some/x-thing".
|
|
||||||
if essence.contains("/x-") {
|
|
||||||
// replace e.g. "application/x-gzip" with "application/gzip"
|
|
||||||
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
|
|
||||||
} else {
|
|
||||||
// replace e.g. "video/mp2t" with "video/x-mp2t"
|
|
||||||
exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
match exts {
|
|
||||||
Some(exts) => {
|
|
||||||
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
|
|
||||||
|
|
||||||
Some(if essence == mime::IMAGE_JPEG.essence_str() {
|
|
||||||
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
|
|
||||||
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
|
|
||||||
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
|
|
||||||
[vec![String::from("jpg")], possible_exts].concat()
|
|
||||||
|
|
||||||
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
|
|
||||||
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
|
|
||||||
// (in my opinion) be "xml".
|
|
||||||
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
|
|
||||||
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
|
|
||||||
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
|
|
||||||
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
|
|
||||||
// to have valid extensions.
|
|
||||||
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
|
|
||||||
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
|
|
||||||
// extension is classed as application/*+xml, consider it OK
|
|
||||||
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
|
|
||||||
|
|
||||||
} else if essence == "application/msword" {
|
|
||||||
// classic office files considered harmful
|
|
||||||
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
|
|
||||||
|
|
||||||
} else if essence == "application/zip" {
|
|
||||||
// neither xdg-mime nor infer seem to be able to detect office XML files properly...
|
|
||||||
[vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")], possible_exts].concat()
|
|
||||||
|
|
||||||
} else if essence == "application/x-ms-dos-executable" {
|
|
||||||
// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
|
|
||||||
// other!
|
|
||||||
[vec![String::from("dll"), String::from("exe")], possible_exts].concat()
|
|
||||||
} else {
|
|
||||||
possible_exts
|
|
||||||
})
|
|
||||||
},
|
|
||||||
None => None
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let essence = essence;
|
||||||
|
let mut exts = mime_guess::get_mime_extensions_str(essence.as_str());
|
||||||
|
if exts.is_none() {
|
||||||
|
// no matches :c
|
||||||
|
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
|
||||||
|
// but mime_guess only understands "some/thing", or vice-versa.
|
||||||
|
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
|
||||||
|
// "some/x-thing".
|
||||||
|
if essence.contains("/x-") {
|
||||||
|
// replace e.g. "application/x-gzip" with "application/gzip"
|
||||||
|
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
|
||||||
|
} else {
|
||||||
|
// replace e.g. "video/mp2t" with "video/x-mp2t"
|
||||||
|
exts = mime_guess::get_mime_extensions_str(&essence.replace("/", "/x-"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let exts = match exts {
|
||||||
|
Some(exts) => {
|
||||||
|
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
|
||||||
|
|
||||||
|
Some(if essence == mime::IMAGE_JPEG.essence_str() {
|
||||||
|
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
|
||||||
|
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
|
||||||
|
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
|
||||||
|
[vec![String::from("jpg")], possible_exts].concat()
|
||||||
|
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
|
||||||
|
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
|
||||||
|
// (in my opinion) be "xml".
|
||||||
|
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
|
||||||
|
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
|
||||||
|
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
|
||||||
|
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
|
||||||
|
// to have valid extensions.
|
||||||
|
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
|
||||||
|
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
|
||||||
|
// extension is classed as application/*+xml, consider it OK
|
||||||
|
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
|
||||||
|
} else if essence == "application/msword" {
|
||||||
|
// classic office files considered harmful
|
||||||
|
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
|
||||||
|
} else if essence == "application/zip" {
|
||||||
|
// neither xdg-mime nor infer seem to be able to detect office XML files properly...
|
||||||
|
[
|
||||||
|
vec![
|
||||||
|
String::from("zip"),
|
||||||
|
String::from("docx"),
|
||||||
|
String::from("xlsx"),
|
||||||
|
String::from("pptx"),
|
||||||
|
],
|
||||||
|
possible_exts,
|
||||||
|
]
|
||||||
|
.concat()
|
||||||
|
} else if essence == "application/x-ms-dos-executable" {
|
||||||
|
// both .dll and .exe files are given the same mime type... but you definitely don't want to rename one to the
|
||||||
|
// other!
|
||||||
|
[vec![String::from("dll"), String::from("exe")], possible_exts].concat()
|
||||||
|
} else {
|
||||||
|
possible_exts
|
||||||
|
})
|
||||||
|
}
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Ok(mut cache) = MIMEXT.write() {
|
||||||
|
cache.insert(essence, exts.clone());
|
||||||
|
exts
|
||||||
|
} else {
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue