first read a smol chunk, if we can't ID the file, read BUF_SIZE
This commit is contained in:
parent
6d49336e6b
commit
8fc3f18466
6 changed files with 28 additions and 5 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -2,4 +2,5 @@
|
||||||
/imgs
|
/imgs
|
||||||
fif_*
|
fif_*
|
||||||
/old
|
/old
|
||||||
|
/awful
|
||||||
*.sh
|
*.sh
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
<excludeFolder url="file://$MODULE_DIR$/target" />
|
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/imgs" />
|
<excludeFolder url="file://$MODULE_DIR$/imgs" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/old" />
|
<excludeFolder url="file://$MODULE_DIR$/old" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/awful" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="inheritedJdk" />
|
<orderEntry type="inheritedJdk" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
|
BIN
chunked
Executable file
BIN
chunked
Executable file
Binary file not shown.
0
src/formats.rs
Normal file
0
src/formats.rs
Normal file
|
@ -3,20 +3,41 @@ use std::path::Path;
|
||||||
use std::io;
|
use std::io;
|
||||||
use mime_guess::Mime;
|
use mime_guess::Mime;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::Read;
|
use std::io::{Read, Seek, SeekFrom};
|
||||||
use smartstring::alias::String;
|
use smartstring::alias::String;
|
||||||
use cached::proc_macro::cached;
|
use cached::proc_macro::cached;
|
||||||
|
use log::{debug, warn};
|
||||||
|
|
||||||
|
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
|
||||||
|
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
|
||||||
|
// bytes. as only two formats need more than 128 bytes, it would be fairly reasonable to only read 128 bytes.
|
||||||
|
// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix
|
||||||
|
// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway,
|
||||||
|
// so maybe it's fine...? maybe this should be configurable by the user? i don't know.
|
||||||
|
// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that mime_type requires
|
||||||
|
// at least 265 bytes to identify a tar file.
|
||||||
|
const BUF_SIZE: usize = 512;
|
||||||
|
|
||||||
pub fn mime_type(db: &SharedMimeInfo, path: &Path) -> io::Result<Option<Mime>, > {
|
pub fn mime_type(db: &SharedMimeInfo, path: &Path) -> io::Result<Option<Mime>, > {
|
||||||
// attempt to read up to the 256 bytes of the file
|
// attempt to read up to the first 64 bytes of the file; BUF_SIZE bytes are only read if that isn't enough to identify it
|
||||||
let mut buffer = [0; 256];
|
let mut buffer = [0; 64];
|
||||||
let mut file = File::open(path)?;
|
let mut file = File::open(path)?;
|
||||||
|
|
||||||
// this can be ignored because it's perfectly okay if the file is less than 256 bytes long - we only care about the
|
// this lint can be ignored: it's okay if the file isn't long enough to fill the buffer, as we only care about the
|
||||||
// first few bytes for the purpose of mime sniffing
|
// first few bytes for the purpose of mime sniffing
|
||||||
#[allow(clippy::unused_io_amount)]
|
#[allow(clippy::unused_io_amount)]
|
||||||
file.read(&mut buffer)?;
|
file.read(&mut buffer)?;
|
||||||
|
|
||||||
|
|
||||||
|
let r = db.get_mime_type_for_data(&buffer).map(|m| m.0);
|
||||||
|
if r.is_some() {
|
||||||
|
return Ok(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut buffer = [0; BUF_SIZE];
|
||||||
|
file.seek(SeekFrom::Start(0))?;
|
||||||
|
file.read(&mut buffer)?;
|
||||||
|
// warn!("dang");
|
||||||
Ok(db.get_mime_type_for_data(&buffer).map(|m| m.0))
|
Ok(db.get_mime_type_for_data(&buffer).map(|m| m.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue