use std::fs::File; use std::io; use std::io::{Read, Seek, SeekFrom}; use std::path::Path; use std::str::FromStr; use cached::cached; use mime_guess::Mime; use smartstring::alias::String; use crate::mimedb::MimeDb; // use log::{debug, warn}; // rather than reading once into a large buffer, it tends to be faster to first try identifying the file from a small // chunk read from the top, and *then* proceeding with the large buffer. many file formats can be easily identified by // the first 128 bytes. of course, not all formats can, and some (OOXML...) require reading a long ways in. const INITIAL_BUF_SIZE: usize = 128; const BUF_SIZE: usize = 4096; pub fn mime_type(db: &T, path: &Path) -> io::Result> { let mut buffer = [0; INITIAL_BUF_SIZE]; let mut file = File::open(path)?; // this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the // first few bytes for the purpose of mime sniffing #[allow(clippy::unused_io_amount)] file.read(&mut buffer)?; let r = db.get_type(&buffer).filter(|mime| // some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already // one such type is XML - there's many more specific types that can be determined by reading further (such as SVG) mime != &mime_guess::mime::TEXT_XML // another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures. // determining that a file is in one of the MS office formats in particular requires looking quite far into the // file. && mime != &Mime::from_str("application/zip").unwrap()); if r.is_some() { return Ok(r); } // attempt to read up to the BUF_SIZE bytes of the file. // we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's // faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes. // for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer // with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator, // collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this // efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both // idiomatic/safe and fast. let mut buffer = [0; BUF_SIZE]; file.seek(SeekFrom::Start(0))?; file.read(&mut buffer)?; Ok(db.get_type(&buffer)) } // TODO: avoid cloning mime if possible, although i don't really see how it would be - maybe instead of passing the mime // object, pass a hash of it? cached! { MIMEXT; fn mime_extension_lookup(mime: Mime) -> Option> = { // match on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions ignores the type // suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. passing the essence_str // (which includes the suffix) fixes this. match mime_guess::get_mime_extensions_str(mime.essence_str()) { Some(exts) => { let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); Some(if mime == mime_guess::mime::IMAGE_JPEG { // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. [vec![String::from("jpg")], possible_exts].concat() } else if mime == mime_guess::mime::TEXT_XML { // a somewhat similar case arises with XML files - the first suggested extension is "addin", when it should // (in my opinion) be "xml". // there's also another problem: SVG files can easily be misidentified as XML files, because they usually // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read // before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg" // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered // to have valid extensions. [vec![String::from("xml"), String::from("svg")], possible_exts].concat() } else if mime == Mime::from_str("application/msword").unwrap() { // classic office files considered harmful vec![String::from("doc"), String::from("xls"), String::from("ppt")] } else { possible_exts }) }, None => None } } }