fif/src/inspectors.rs at 5907309689f264c5f14a04aaea831258b8fd0dd3

Lynnesbian 5907309689 renamed modules in accordance with https://rust-lang.github.io/api-guidelines/naming.html

2021-03-01 19:21:00 +10:00

110 lines

5.4 KiB

Rust

Raw Blame History

 //! Functions for getting the mime type and extension of a file.
 use std::fs::File;
 use std::io;
 use std::io::{Read, Seek, SeekFrom};
 use std::path::Path;
 use std::str::FromStr;
 use cached::cached;
 use mime_guess::Mime;
 use smartstring::alias::String;
 use crate::mime_db::MimeDb;
 /// The number of bytes to read initially.
 ///
 /// Rather than reading up to [BUF_SIZE] bytes of the file straight away, it tends to be faster to read a small chunk
 /// of the file and try to identify that, proceeding with the larger buffer only if that fails. Many file formats
 /// can be identified from their first few dozen bytes, so the "happy path" will likely be taken in the majority of
 /// cases.
 const INITIAL_BUF_SIZE: usize = 128;
 /// The number of bytes to read if the file couldn't be identified from its first [INITIAL_BUF_SIZE] bytes.
 ///
 /// This larger read is also used when the first pass only yields a generic type (XML or ZIP) that may turn out to be
 /// something more specific once more of the file has been inspected.
 const BUF_SIZE: usize = 4096;
 /// Fills `buf` from `reader`, stopping early only at the end of the input, and returns the number of bytes read.
 ///
 /// A single [`Read::read`] call is allowed to return fewer bytes than are available, and may fail with
 /// [`io::ErrorKind::Interrupted`] even though a retry would succeed. Both cases are retried here, so `buf` ends up
 /// holding as much of the input as fits. Bytes of `buf` past the returned count are left untouched.
 fn read_prefix<R: Read>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
 	let mut filled = 0;
 	while filled < buf.len() {
 		match reader.read(&mut buf[filled..]) {
 			// a zero-length read means the end of the input has been reached
 			Ok(0) => break,
 			Ok(count) => filled += count,
 			// an interrupted read transferred nothing - just try again
 			Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
 			Err(e) => return Err(e),
 		}
 	}
 	Ok(filled)
 }
 /// Tries to identify the mimetype of a file from a given path.
 ///
 /// The first [INITIAL_BUF_SIZE] bytes are inspected first. If `db` can't identify them, or identifies them as one of
 /// the generic container types listed below, up to [BUF_SIZE] bytes are read from the start of the file and inspected
 /// instead.
 ///
 /// Returns `Ok(None)` if `db` is unable to determine a type.
 ///
 /// # Errors
 /// Returns an error if the file can't be opened, read, or rewound.
 pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
 	let mut buffer = [0_u8; INITIAL_BUF_SIZE];
 	let mut file = File::open(path)?;
 	// the number of bytes read is deliberately ignored: it's okay if the file isn't long enough to fill the buffer, as
 	// we only care about the first few bytes for the purpose of mime sniffing. the buffer is zero-initialised, so a
 	// short file simply leaves trailing zeroes behind.
 	read_prefix(&mut file, &mut buffer)?;
 	let r = db.get_type(&buffer).filter(|mime|
 		// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
 		// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
 		mime != &mime_guess::mime::TEXT_XML
 		// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
 		// determining that a file is in one of the MS office formats in particular requires looking quite far into the
 		// file.
 		&& mime != &Mime::from_str("application/zip").unwrap());
 	if r.is_some() {
 		return Ok(r);
 	}
 	// attempt to read up to the BUF_SIZE bytes of the file.
 	// we've already read the first INITIAL_BUF_SIZE bytes into a buffer, but the original author found no way of reusing
 	// them that was measurably faster than simply moving the seek position back to the start of the file and re-reading
 	// the whole BUF_SIZE bytes, so that is what happens here.
 	let mut buffer = [0_u8; BUF_SIZE];
 	file.seek(SeekFrom::Start(0))?;
 	read_prefix(&mut file, &mut buffer)?;
 	Ok(db.get_type(&buffer))
 }
 // TODO: avoid cloning mime if possible, although i don't really see how it would be - maybe instead of passing the mime
 // object, pass a hash of it?
 cached! {
 	MIMEXT;
 	fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
 		// Returns a list of known extensions for this mime type, or None if mime_guess doesn't know of any.
 		// (this would be a doc comment, but the cached! macro doesn't accept one here.)
 		//
 		// the lookup is keyed on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions
 		// ignores the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. the
 		// essence_str includes the suffix, which avoids that problem.
 		mime_guess::get_mime_extensions_str(mime.essence_str()).map(|known| {
 			// extensions that should be suggested ahead of whatever mime_guess lists for this type
 			let mut exts: Vec<String> = if mime == mime_guess::mime::IMAGE_JPEG {
 				// the known list starts with "jpe", because it sorts alphabetically before "jpeg" and "jpg". jpg/jpeg are far
 				// more common than jpe, so "jpg" goes first to make it the extension that fif suggests.
 				vec![String::from("jpg")]
 			} else if mime == mime_guess::mime::TEXT_XML {
 				// similarly, the first known extension for XML is "addin", where "xml" is the more sensible suggestion.
 				// on top of that, SVG files are easily misidentified as XML, because they usually *are* valid XML - the more
 				// whitespace and comments an SVG file begins with, the more bytes must be read before it can be told apart
 				// from plain XML. listing "svg" as a valid XML extension means such files are still considered to have a
 				// valid extension.
 				vec![String::from("xml"), String::from("svg")]
 			} else if mime == Mime::from_str("application/msword").unwrap() {
 				// classic office files considered harmful - the known list is replaced outright rather than extended
 				return vec![String::from("doc"), String::from("xls"), String::from("ppt")];
 			} else if mime == Mime::from_str("application/zip").unwrap() {
 				// neither xdg-mime nor infer seem to be able to detect office XML files properly...
 				vec![String::from("zip"), String::from("docx"), String::from("xlsx"), String::from("pptx")]
 			} else {
 				Vec::new()
 			};
 			exts.extend(known.iter().map(|&ext| String::from(ext)));
 			exts
 		})
 	}
 }

110 lines 5.4 KiB Rust Raw Blame History

110 lines

5.4 KiB

Rust

Raw Blame History