SVG support, better and more comments, minor code cleanup
This commit is contained in:
parent
21fb26e3da
commit
426e09fb05
4 changed files with 96 additions and 39 deletions
|
@ -89,9 +89,9 @@ impl Format for Script {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn no_known_extension<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
fn no_known_extension<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
||||||
write!(f, "printf No known extension for ")?;
|
write!(f, "echo No known extension for ")?;
|
||||||
write_pathbuf(f, path)?;
|
write_pathbuf(f, path)?;
|
||||||
writeln!(f,"\nprintf '\n'")
|
writeln!(f, )
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unreadable<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
fn unreadable<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
||||||
|
@ -115,6 +115,6 @@ impl Format for Script {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn footer<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> {
|
fn footer<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> {
|
||||||
writeln!(f, "\nprintf 'Done.\\n'")
|
writeln!(f, "\necho 'Done.'")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,14 +17,19 @@ use crate::mimedb::MimeDb;
|
||||||
// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix
|
// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix
|
||||||
// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway,
|
// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway,
|
||||||
// so maybe it's fine...? maybe this should be configurable by the user? i don't know.
|
// so maybe it's fine...? maybe this should be configurable by the user? i don't know.
|
||||||
// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that mime_type requires
|
// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that xdg-mime requires
|
||||||
// at least 265 bytes to identify a tar file.
|
// at least 265 bytes to identify a tar file.
|
||||||
|
|
||||||
|
// additionally, since many formats can be identified with ≤64 bytes, it's worth reading 64 bytes, checking for the mime
|
||||||
|
// type, and then reading the full 512 bytes if necessary. in most cases, this will end up being faster on the whole,
|
||||||
|
// even though two reads are needed for certain formats, unless the directory being scanned is predominantly made up of
|
||||||
|
// such formats.
|
||||||
|
|
||||||
|
const INITIAL_BUF_SIZE: usize = 64;
|
||||||
const BUF_SIZE: usize = 512;
|
const BUF_SIZE: usize = 512;
|
||||||
|
|
||||||
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
// attempt to read up to the BUF_SIZE bytes of the file
|
let mut buffer = [0; INITIAL_BUF_SIZE];
|
||||||
|
|
||||||
let mut buffer = [0; 64];
|
|
||||||
let mut file = File::open(path)?;
|
let mut file = File::open(path)?;
|
||||||
|
|
||||||
// this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the
|
// this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the
|
||||||
|
@ -33,14 +38,21 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
file.read(&mut buffer)?;
|
file.read(&mut buffer)?;
|
||||||
|
|
||||||
let r = db.get_type(&buffer);
|
let r = db.get_type(&buffer);
|
||||||
|
|
||||||
if r.is_some() {
|
if r.is_some() {
|
||||||
return Ok(r);
|
return Ok(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// attempt to read up to BUF_SIZE bytes of the file.
|
||||||
|
// we've already read the first 64 bytes into a buffer, but i can't see an obvious way to reuse those 64 bytes that's
|
||||||
|
// faster than simply moving the seek position back to the start of the file and re-reading the whole 512 bytes.
|
||||||
|
// for example, starting with a buffer of 64 bytes, then creating a new 512 byte buffer from the contents of the first
|
||||||
|
// buffer with (512 - 64) blank bytes, then finally reading the rest, is much slower than simply reading the file
|
||||||
|
// twice. i don't at all doubt that there IS a way to do this efficiently, and i can think of a way in principle, but
|
||||||
|
// i'm not sure how to express it in a way that is both idiomatic/safe and fast.
|
||||||
let mut buffer = [0; BUF_SIZE];
|
let mut buffer = [0; BUF_SIZE];
|
||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
file.read(&mut buffer)?;
|
file.read(&mut buffer)?;
|
||||||
// warn!("dang");
|
|
||||||
Ok(db.get_type(&buffer))
|
Ok(db.get_type(&buffer))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -49,13 +61,34 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
cached! {
|
cached! {
|
||||||
MIMEXT;
|
MIMEXT;
|
||||||
fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
|
fn mime_extension_lookup(mime: Mime) -> Option<Vec<String>> = {
|
||||||
if mime == mime_guess::mime::IMAGE_JPEG {
|
|
||||||
// jpeg files are given the primary extension "jpe", due to the extension list being stored in alphabetical order.
|
// match on the mime's `essence_str` rather than the mime itself - mime_guess::get_mime_extensions ignores the type
|
||||||
// to handle this particular case, return a custom vector consisting of just "jpg" and "jpeg".
|
// suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. passing the essence_str
|
||||||
return Some(vec![String::from("jpg"), String::from("jpeg")]);
|
// (which includes the suffix) fixes this.
|
||||||
}
|
match mime_guess::get_mime_extensions_str(mime.essence_str()) {
|
||||||
match mime_guess::get_mime_extensions(&mime) { // get a list of possible extensions for this mime type
|
Some(exts) => {
|
||||||
Some(exts) => Some(exts.iter().map(|e| String::from(*e)).collect()),
|
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
|
||||||
|
|
||||||
|
Some(if mime == mime_guess::mime::IMAGE_JPEG {
|
||||||
|
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
|
||||||
|
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
|
||||||
|
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
|
||||||
|
[vec![String::from("jpg")], possible_exts].concat()
|
||||||
|
|
||||||
|
} else if mime == mime_guess::mime::TEXT_XML {
|
||||||
|
// a somewhat similar case arises with XML files - the first suggested extension is "addin", when it should
|
||||||
|
// (in my opinion) be "xml".
|
||||||
|
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
|
||||||
|
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
|
||||||
|
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
|
||||||
|
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
|
||||||
|
// to have valid extensions.
|
||||||
|
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
|
||||||
|
|
||||||
|
} else {
|
||||||
|
possible_exts
|
||||||
|
})
|
||||||
|
},
|
||||||
None => None
|
None => None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
12
src/main.rs
12
src/main.rs
|
@ -118,13 +118,13 @@ fn scan_file(entry: &DirEntry) -> Result<Findings, (ScanError, PathBuf)> {
|
||||||
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
Some(e) if entry_ext.is_some() => e.contains(&entry_ext.unwrap().to_lowercase().into()),
|
||||||
// there is a known set of extensions for this mimetype, but the file has no extension
|
// there is a known set of extensions for this mimetype, but the file has no extension
|
||||||
Some(_) => false,
|
Some(_) => false,
|
||||||
// there is no known set of extensions for this mimetype -- assume it's correct
|
// there is no known set of extensions for this mimetype :(
|
||||||
None => true,
|
None => false,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Findings {
|
Ok(Findings {
|
||||||
file: entry.path().to_path_buf(),
|
file: entry.path().to_path_buf(),
|
||||||
valid, // make this a function
|
valid,
|
||||||
mime: result,
|
mime: result,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -165,13 +165,13 @@ fn main() {
|
||||||
#[cfg(feature = "infer-backend")]
|
#[cfg(feature = "infer-backend")]
|
||||||
MIMEDB
|
MIMEDB
|
||||||
.set(mimedb::InferDb::init())
|
.set(mimedb::InferDb::init())
|
||||||
.or(Err("Failed to initialise MIMEDB"))
|
.or(Err("Failed to initialise Infer backend!"))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
#[cfg(feature = "xdg-mime-backend")]
|
#[cfg(feature = "xdg-mime-backend")]
|
||||||
MIMEDB
|
MIMEDB
|
||||||
.set(mimedb::XdgDb::init())
|
.set(mimedb::XdgDb::init())
|
||||||
.or(Err("Failed to initialise MIMEDB"))
|
.or(Err("Failed to initialise XDG Mime backend!"))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
debug!("Iterating directory: {:?}", args.dirs);
|
debug!("Iterating directory: {:?}", args.dirs);
|
||||||
|
@ -207,7 +207,7 @@ fn main() {
|
||||||
info!(
|
info!(
|
||||||
"{:?} should have file extension {}",
|
"{:?} should have file extension {}",
|
||||||
r.file,
|
r.file,
|
||||||
r.recommended_extension().unwrap()
|
r.recommended_extension().unwrap_or("???".into())
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
trace!("{:?} is totally fine", r.file)
|
trace!("{:?} is totally fine", r.file)
|
||||||
|
|
|
@ -17,25 +17,49 @@ pub struct InferDb {
|
||||||
impl MimeDb for InferDb {
|
impl MimeDb for InferDb {
|
||||||
fn init() -> Self {
|
fn init() -> Self {
|
||||||
let mut info = infer::Infer::new();
|
let mut info = infer::Infer::new();
|
||||||
// add a random file type just to make sure adding works and such
|
|
||||||
|
// jpeg2000 support because why the stinch not
|
||||||
info.add("image/jpeg2000", ".jp2", |buf| {
|
info.add("image/jpeg2000", ".jp2", |buf| {
|
||||||
buf.len() > 23
|
buf.len() > 23
|
||||||
&& buf[0] == 0x00
|
// compare the leading 12 bytes and bytes 20..24 separately: slices of different lengths never compare equal, so
// matching `buf[..23]` against a 16-byte literal would always be false and no file would ever be detected
&& buf[..12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A"[..]
&& buf[20..24] == b"\x6A\x70\x32\x20"[..]
|
||||||
&& buf[1] == 0x00
|
});
|
||||||
&& buf[2] == 0x00
|
|
||||||
&& buf[3] == 0x0C
|
info.add("image/svg+xml", "svg", |buf| {
|
||||||
&& buf[4] == 0x6A
|
// before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish
|
||||||
&& buf[5] == 0x50
|
// by "SGML-ish", i mean starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign,
|
||||||
&& buf[6] == 0x20
|
// and then there's some other stuff we don't care about right now
|
||||||
&& buf[7] == 0x20
|
|
||||||
&& buf[8] == 0x0D
|
// so, here comes our fancy pants """""SGML-ish validator"""""
|
||||||
&& buf[9] == 0x0A
|
for i in 0..buf.len() {
|
||||||
&& buf[10] == 0x87
|
match buf[i] {
|
||||||
&& buf[11] == 0x0A
|
// whitespace (according to https://www.w3.org/TR/xml/#NT-S)
|
||||||
&& buf[20] == 0x6A
|
b'\t' | b'\r' | b'\n' | b'\x20' => continue,
|
||||||
&& buf[21] == 0x70
|
b'<' => break,
|
||||||
&& buf[22] == 0x32
|
_ => return false
|
||||||
&& buf[23] == 0x20
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// finally, to check whether or not the file is an SVG:
|
||||||
|
// - split the buffer up into chunks separated by the less than sign
|
||||||
|
// - check to see if this chunk starts with any of these identifiers:
|
||||||
|
let identifiers: Vec<&[u8]> = vec![
|
||||||
|
"svg".as_bytes(),
|
||||||
|
"SVG".as_bytes(),
|
||||||
|
"!DOCTYPE svg".as_bytes(),
|
||||||
|
"!DOCTYPE SVG".as_bytes()
|
||||||
|
];
|
||||||
|
// - if it does, the nested `any` will short circuit and immediately return true, causing the parent `any` to do
|
||||||
|
// the same
|
||||||
|
// - and finally, if none of the chunks match, we'll return false
|
||||||
|
|
||||||
|
// TODO: this is kind of messy, i'd like to clean it up somehow :(
|
||||||
|
buf
|
||||||
|
.split(|c| *c == b'<')
|
||||||
|
.any(|buf| {
|
||||||
|
identifiers
|
||||||
|
.iter()
|
||||||
|
.any(|id| buf.starts_with(id))
|
||||||
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
// unmut
|
// unmut
|
||||||
|
|
Loading…
Reference in a new issue