2021-02-05 12:45:51 +00:00
|
|
|
use std::fs::File;
|
2021-02-14 14:30:12 +00:00
|
|
|
use std::io;
|
2021-02-06 11:48:31 +00:00
|
|
|
use std::io::{Read, Seek, SeekFrom};
|
2021-02-14 14:30:12 +00:00
|
|
|
use std::path::Path;
|
2021-02-21 16:32:38 +00:00
|
|
|
use std::str::FromStr;
|
2021-02-14 14:30:12 +00:00
|
|
|
|
|
|
|
use cached::cached;
|
|
|
|
use mime_guess::Mime;
|
2021-02-05 12:45:51 +00:00
|
|
|
use smartstring::alias::String;
|
2021-02-14 18:58:57 +00:00
|
|
|
|
|
|
|
use crate::mimedb::MimeDb;
|
2021-02-14 14:30:12 +00:00
|
|
|
|
2021-02-21 22:46:17 +00:00
|
|
|
// rather than reading once into a large buffer, it tends to be faster to first try identifying the file from a small
|
|
|
|
// chunk read from the top, and *then* proceeding with the large buffer. many file formats can be easily identified by
|
|
|
|
// the first 128 bytes. of course, not all formats can, and some (OOXML...) require reading a long ways in.
|
2021-02-21 11:30:58 +00:00
|
|
|
|
2021-02-21 22:46:17 +00:00
|
|
|
/// size, in bytes, of the small buffer that `mime_type` fills from the start of a file for its first identification attempt.
const INITIAL_BUF_SIZE: usize = 128;
|
|
|
|
/// size, in bytes, of the larger buffer that `mime_type` falls back to when the first `INITIAL_BUF_SIZE` bytes were not enough.
const BUF_SIZE: usize = 4096;
|
2021-02-05 09:15:12 +00:00
|
|
|
|
2021-02-18 09:48:38 +00:00
|
|
|
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
2021-02-21 11:30:58 +00:00
|
|
|
let mut buffer = [0; INITIAL_BUF_SIZE];
|
2021-02-05 12:45:51 +00:00
|
|
|
let mut file = File::open(path)?;
|
2021-02-05 09:15:12 +00:00
|
|
|
|
2021-02-06 11:48:31 +00:00
|
|
|
// this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the
|
2021-02-05 12:45:51 +00:00
|
|
|
// first few bytes for the purpose of mime sniffing
|
|
|
|
#[allow(clippy::unused_io_amount)]
|
2021-02-21 14:15:09 +00:00
|
|
|
file.read(&mut buffer)?;
|
2021-02-06 11:48:31 +00:00
|
|
|
|
2021-02-21 22:46:17 +00:00
|
|
|
let r = db.get_type(&buffer).filter(|mime|
|
|
|
|
// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
|
|
|
|
// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
|
|
|
|
mime != &mime_guess::mime::TEXT_XML
|
|
|
|
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
|
|
|
|
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
|
|
|
|
// file.
|
|
|
|
&& mime != &Mime::from_str("application/zip").unwrap());
|
2021-02-21 11:30:58 +00:00
|
|
|
|
2021-02-06 11:48:31 +00:00
|
|
|
if r.is_some() {
|
|
|
|
return Ok(r);
|
|
|
|
}
|
|
|
|
|
2021-02-21 11:30:58 +00:00
|
|
|
// attempt to read up to the BUF_SIZE bytes of the file.
|
2021-02-21 22:46:17 +00:00
|
|
|
// we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
|
|
|
|
// faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
|
|
|
|
// for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
|
|
|
|
// with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
|
|
|
|
// collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
|
|
|
|
// efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
|
|
|
|
// idiomatic/safe and fast.
|
2021-02-06 11:48:31 +00:00
|
|
|
let mut buffer = [0; BUF_SIZE];
|
|
|
|
file.seek(SeekFrom::Start(0))?;
|
|
|
|
file.read(&mut buffer)?;
|
2021-02-14 18:58:57 +00:00
|
|
|
Ok(db.get_type(&buffer))
|
2021-02-05 12:45:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: avoid cloning mime if possible, although i don't really see how that could be done - maybe instead of passing the mime
|
|
|
|
// object, pass a hash of it?
|
2021-02-14 14:30:12 +00:00
|
|
|
cached! {
    MIMEXT;
    fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
        // returns the file extensions associated with `mime`, with the extension that should be suggested first at the
        // front of the list, or `None` if mime_guess doesn't know of any extensions for it.
        //
        // the lookup is done with the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions
        // ignores the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions.
        // passing the essence_str (which includes the suffix) fixes this.
        mime_guess::get_mime_extensions_str(mime.essence_str()).map(|known| -> Vec<String> {
            // extensions that get placed in front of everything mime_guess returned
            let preferred: &[&str] = if mime == mime_guess::mime::IMAGE_JPEG {
                // mime_guess lists "jpe" first, because it's alphabetically before "jpeg" and "jpg". jpg/jpeg are far
                // more common than jpe though, so "jpg" goes to the front to make it the suggested extension.
                &["jpg"]
            } else if mime == mime_guess::mime::TEXT_XML {
                // similarly, the first extension listed for XML is "addin", when it should (arguably) be "xml".
                // on top of that, SVG files are easily misidentified as XML, because they usually *are* valid XML - the
                // more whitespace and comments an SVG begins with, the more bytes must be read before it can be told
                // apart from plain XML. accepting "svg" as an extension for XML files means that such misidentified
                // SVGs are still considered to have a valid extension.
                &["xml", "svg"]
            } else if mime == Mime::from_str("application/msword").unwrap() {
                // classic office files considered harmful - this list *replaces* mime_guess's answer entirely
                return vec![String::from("doc"), String::from("xls"), String::from("ppt")];
            } else if mime == Mime::from_str("application/zip").unwrap() {
                // neither xdg-mime nor infer seem to be able to detect office XML files properly...
                &["zip", "docx", "xlsx", "pptx"]
            } else {
                &[]
            };

            preferred
                .iter()
                .chain(known.iter())
                .map(|ext| String::from(*ext))
                .collect()
        })
    }
}
|