From 426e09fb05f206f93a9568afc4dcec67d223d4b9 Mon Sep 17 00:00:00 2001 From: Lynne Date: Sun, 21 Feb 2021 21:30:58 +1000 Subject: [PATCH] SVG support, better and more comments, minor code cleanup --- src/formats.rs | 6 ++--- src/inspectors.rs | 59 ++++++++++++++++++++++++++++++++++++----------- src/main.rs | 12 +++++----- src/mimedb.rs | 58 ++++++++++++++++++++++++++++++++-------------- 4 files changed, 96 insertions(+), 39 deletions(-) diff --git a/src/formats.rs b/src/formats.rs index 095b8ef..258542e 100644 --- a/src/formats.rs +++ b/src/formats.rs @@ -89,9 +89,9 @@ impl Format for Script { } fn no_known_extension(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { - write!(f, "printf No known extension for ")?; + write!(f, "echo No known extension for ")?; write_pathbuf(f, path)?; - writeln!(f,"\nprintf '\n'") + writeln!(f, ) } fn unreadable(&self, f: &mut W, path: &PathBuf) -> io::Result<()> { @@ -115,6 +115,6 @@ impl Format for Script { } fn footer(&self, _: &Entries, f: &mut W) -> io::Result<()> { - writeln!(f, "\nprintf 'Done.\\n'") + writeln!(f, "\necho 'Done.'") } } diff --git a/src/inspectors.rs b/src/inspectors.rs index fb66d3f..7cdbbaa 100644 --- a/src/inspectors.rs +++ b/src/inspectors.rs @@ -17,30 +17,42 @@ use crate::mimedb::MimeDb; // unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix // world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway, // so maybe it's fine...? maybe this should be configurable by the user? i don't know. -// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that mime_type requires +// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that xdg-mime requires // at least 265 bytes to identify a tar file. 
+ +// additionally, since many formats can be identified with ≤64 bytes, it's worth reading 64 bytes, checking for the mime +// type, and then reading the full 512 bytes if necessary. in most cases, this will end up being faster on the whole, +// even though two reads are needed for certain formats, unless the directory being scanned is predominantly made up of +// such formats. + +const INITIAL_BUF_SIZE: usize = 64; const BUF_SIZE: usize = 512; pub fn mime_type(db: &T, path: &Path) -> io::Result> { - // attempt to read up to the BUF_SIZE bytes of the file - - let mut buffer = [0; 64]; + let mut buffer = [0; INITIAL_BUF_SIZE]; let mut file = File::open(path)?; // this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the // first few bytes for the purpose of mime sniffing #[allow(clippy::unused_io_amount)] - file.read(&mut buffer)?; + file.read(&mut buffer)?; let r = db.get_type(&buffer); + if r.is_some() { return Ok(r); } + // attempt to read up to the BUF_SIZE bytes of the file. + // we've already read the first 64 bytes into a buffer, but i can't see an obvious way to reuse those 64 bytes that's + // faster than simply moving the seek position back to the start of the file and re-reading the whole 512 bytes. + // for example, starting with a buffer of 64 bytes, then creating a new 512 byte buffer from the contents of the first + // buffer with (512 - 64) blank bytes, then finally reading the rest, is much slower than simply reading the file + // twice. i don't at all doubt that there IS a way to do this efficiently, and i can think of a way in principle, but + // i'm not sure how to express it in a way that is both idiomatic/safe and fast. let mut buffer = [0; BUF_SIZE]; file.seek(SeekFrom::Start(0))?; file.read(&mut buffer)?; - // warn!("dang"); Ok(db.get_type(&buffer)) } @@ -49,13 +61,34 @@ pub fn mime_type(db: &T, path: &Path) -> io::Result> { cached! 
{ MIMEXT; fn mime_extension_lookup(mime: Mime) -> Option> = { - if mime == mime_guess::mime::IMAGE_JPEG { - // jpeg files are given the primary extension "jpe", due to the extension list being stored in alphabetical order. - // to handle this particular case, return a custom vector consisting of just "jpg" and "jpeg". - return Some(vec![String::from("jpg"), String::from("jpeg")]); - } - match mime_guess::get_mime_extensions(&mime) { // get a list of possible extensions for this mime type - Some(exts) => Some(exts.iter().map(|e| String::from(*e)).collect()), + + // match on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions ignores the type + // suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. passing the essence_str + // (which includes the suffix) fixes this. + match mime_guess::get_mime_extensions_str(mime.essence_str()) { + Some(exts) => { + let possible_exts: Vec = exts.iter().map(|e| String::from(*e)).collect(); + + Some(if mime == mime_guess::mime::IMAGE_JPEG { + // possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are + // far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can + // add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif. + [vec![String::from("jpg")], possible_exts].concat() + + } else if mime == mime_guess::mime::TEXT_XML { + // a somewhat similar case arises with XML files - the first suggested extension is "addin", when it should + // (in my opinion) be "xml". + // there's also another problem: SVG files can easily be misidentified as XML files, because they usually + // *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read + // before it's possible to determine that it's an SVG rather than an XML file. 
to "fix" this, we can add "svg" + // as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered + // to have valid extensions. + [vec![String::from("xml"), String::from("svg")], possible_exts].concat() + + } else { + possible_exts + }) + }, None => None } } diff --git a/src/main.rs b/src/main.rs index 11f2f55..b3a64e9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -118,13 +118,13 @@ fn scan_file(entry: &DirEntry) -> Result { Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()), // there is a known set of extensions for this mimetype, but the file has no extension Some(_) => false, - // there is no known set of extensions for this mimetype -- assume it's correct - None => true, + // there is no known set of extensions for this mimetype :( + None => false, }; Ok(Findings { file: entry.path().to_path_buf(), - valid, // make this a function + valid, mime: result, }) } @@ -165,13 +165,13 @@ fn main() { #[cfg(feature = "infer-backend")] MIMEDB .set(mimedb::InferDb::init()) - .or(Err("Failed to initialise MIMEDB")) + .or(Err("Failed to initialise Infer backend!")) .unwrap(); #[cfg(feature = "xdg-mime-backend")] MIMEDB .set(mimedb::XdgDb::init()) - .or(Err("Failed to initialise MIMEDB")) + .or(Err("Failed to initialise XDG Mime backend!")) .unwrap(); debug!("Iterating directory: {:?}", args.dirs); @@ -207,7 +207,7 @@ fn main() { info!( "{:?} should have file extension {}", r.file, - r.recommended_extension().unwrap() + r.recommended_extension().unwrap_or("???".into()) ) } else { trace!("{:?} is totally fine", r.file) diff --git a/src/mimedb.rs b/src/mimedb.rs index c080718..abaf476 100644 --- a/src/mimedb.rs +++ b/src/mimedb.rs @@ -17,25 +17,49 @@ pub struct InferDb { impl MimeDb for InferDb { fn init() -> Self { let mut info = infer::Infer::new(); - // add a random file type just to make sure adding works and such + + // jpeg2000 support because why the stinch not info.add("image/jpeg2000", 
".jp2", |buf| { buf.len() > 23 - && buf[0] == 0x00 - && buf[1] == 0x00 - && buf[2] == 0x00 - && buf[3] == 0x0C - && buf[4] == 0x6A - && buf[5] == 0x50 - && buf[6] == 0x20 - && buf[7] == 0x20 - && buf[8] == 0x0D - && buf[9] == 0x0A - && buf[10] == 0x87 - && buf[11] == 0x0A - && buf[20] == 0x6A - && buf[21] == 0x70 - && buf[22] == 0x32 - && buf[23] == 0x20 + && buf[..23] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A\x6A\x70\x32\x20"[..] + }); + + info.add("image/svg+xml", "svg", |buf| { + // before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish + // by "SGML-ish", i mean starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign, + // and then there's some other stuff we don't care about right now + + // so, here comes our fancy pants """""SGML-ish validator""""" + for i in 0..buf.len() { + match buf[i] { + // whitespace (according to https://www.w3.org/TR/xml/#NT-S) + b'\t' | b'\r' | b'\n' | b'\x20' => continue, + b'<' => break, + _ => return false + } + } + + // finally, to check whether or not the file is an SVG: + // - split the buffer up into chunks separated by the less than sign + // - check to see if this chunk starts with any of these identifiers: + let identifiers: Vec<&[u8]> = vec![ + "svg".as_bytes(), + "SVG".as_bytes(), + "!DOCTYPE svg".as_bytes(), + "!DOCTYPE SVG".as_bytes() + ]; + // - if it does, the nested `any` will short circuit and immediately return true, causing the parent `any` to do + // the same + // - and finally, if none of the chunks match, we'll return false + + // TODO: this is kind of messy, i'd like to clean it up somehow :( + buf + .split(|c| *c == b'<') + .any(|buf| { + identifiers + .iter() + .any(|id| buf.starts_with(id)) + }) }); // unmut