first read a smol chunk, if we can't ID the file, read BUF_SIZE

This commit is contained in:
Lynne Megido 2021-02-06 21:48:31 +10:00
parent 6d49336e6b
commit 8fc3f18466
Signed by: lynnesbian
GPG key ID: F0A184B5213D9F90
6 changed files with 28 additions and 5 deletions

1
.gitignore vendored
View file

@ -2,4 +2,5 @@
/imgs /imgs
fif_* fif_*
/old /old
/awful
*.sh *.sh

View file

@ -6,6 +6,7 @@
<excludeFolder url="file://$MODULE_DIR$/target" /> <excludeFolder url="file://$MODULE_DIR$/target" />
<excludeFolder url="file://$MODULE_DIR$/imgs" /> <excludeFolder url="file://$MODULE_DIR$/imgs" />
<excludeFolder url="file://$MODULE_DIR$/old" /> <excludeFolder url="file://$MODULE_DIR$/old" />
<excludeFolder url="file://$MODULE_DIR$/awful" />
</content> </content>
<orderEntry type="inheritedJdk" /> <orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />

View file

@ -26,4 +26,4 @@ default-features = false
features = ["termcolor", "atty"] features = ["termcolor", "atty"]
[profile.release] [profile.release]
lto = "thin" lto = "thin"

BIN
chunked Executable file

Binary file not shown.

0
src/formats.rs Normal file
View file

View file

@ -3,20 +3,41 @@ use std::path::Path;
use std::io; use std::io;
use mime_guess::Mime; use mime_guess::Mime;
use std::fs::File; use std::fs::File;
use std::io::Read; use std::io::{Read, Seek, SeekFrom};
use smartstring::alias::String; use smartstring::alias::String;
use cached::proc_macro::cached; use cached::proc_macro::cached;
use log::{debug, warn};
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
// bytes. as only two formats need more than 128 bytes, it would be fairly reasonable to only read 128 bytes.
// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix
// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway,
// so maybe it's fine...? maybe this should be configurable by the user? i don't know.
// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that mime_type requires
// at least 265 bytes to identify a tar file.
// NOTE(review): the value below (512) is larger than the 265-byte minimum measured above; the reason for choosing 512
// specifically is not stated here - worth recording if there is one.
const BUF_SIZE: usize = 512;
// sniffs the mime type of the file at `path` by reading its leading bytes and passing them to
// `db.get_mime_type_for_data`. returns `Ok(None)` when the database call yields no match; an io error from opening,
// reading, or seeking the file is propagated to the caller via `?`.
// NOTE(review): each line below appears to carry both columns of a side-by-side diff (old version on the left, new
// version on the right); the notes here describe the right-hand (new) version.
pub fn mime_type(db: &SharedMimeInfo, path: &Path) -> io::Result<Option<Mime>, > { pub fn mime_type(db: &SharedMimeInfo, path: &Path) -> io::Result<Option<Mime>, > {
// attempt to read up to the 256 bytes of the file // attempt to read up to the BUF_SIZE bytes of the file
// NOTE(review): in the new version this first buffer is 64 bytes, not BUF_SIZE - the comment above looks out of date.
let mut buffer = [0; 256]; let mut buffer = [0; 64];
let mut file = File::open(path)?;
// this can be ignored because it's perfectly okay if the file is less than 256 bytes long - we only care about the // this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the
// first few bytes for the purpose of mime sniffing // first few bytes for the purpose of mime sniffing
#[allow(clippy::unused_io_amount)] #[allow(clippy::unused_io_amount)]
file.read(&mut buffer)?;
// first pass: try to identify the file from the small chunk alone, and return straight away on a match.
// the whole zero-initialised buffer is passed on, regardless of how many bytes `read` actually filled in.
let r = db.get_mime_type_for_data(&buffer).map(|m| m.0);
if r.is_some() {
return Ok(r);
}
// second pass: no match from the small chunk, so rewind to the start of the file and retry with BUF_SIZE bytes.
// NOTE(review): the `#[allow(clippy::unused_io_amount)]` above is attached to the first `read` only; this second
// `read` also discards its byte count - confirm whether it needs its own allow.
let mut buffer = [0; BUF_SIZE];
file.seek(SeekFrom::Start(0))?;
file.read(&mut buffer)?;
// warn!("dang");
Ok(db.get_mime_type_for_data(&buffer).map(|m| m.0)) Ok(db.get_mime_type_for_data(&buffer).map(|m| m.0))
} }