better document support, print version properly, display version
This commit is contained in:
parent
c92dbbd075
commit
db94465bb7
9 changed files with 82 additions and 67 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -5,3 +5,4 @@ fif_*
|
||||||
*.sh
|
*.sh
|
||||||
!clippy.sh
|
!clippy.sh
|
||||||
cargo-timing*.html
|
cargo-timing*.html
|
||||||
|
todo.txt
|
||||||
|
|
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -176,7 +176,7 @@ checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fif"
|
name = "fif"
|
||||||
version = "0.2.3+hotfix"
|
version = "0.2.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cached",
|
"cached",
|
||||||
"clap",
|
"clap",
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
[package]
|
[package]
|
||||||
name = "fif"
|
name = "fif"
|
||||||
description = "A command-line tool for detecting and optionally correcting files with incorrect extensions."
|
description = "A command-line tool for detecting and optionally correcting files with incorrect extensions."
|
||||||
version = "0.2.3+hotfix"
|
version = "0.2.4"
|
||||||
authors = ["Lynnesbian <lynne@bune.city>"]
|
authors = ["Lynnesbian <lynne@bune.city>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
|
|
|
@ -16,13 +16,17 @@ impl ExtensionSet {
|
||||||
Self::Images => mime_guess::get_mime_extensions_str("image/*"),
|
Self::Images => mime_guess::get_mime_extensions_str("image/*"),
|
||||||
Self::Videos => mime_guess::get_mime_extensions_str("video/*"),
|
Self::Videos => mime_guess::get_mime_extensions_str("video/*"),
|
||||||
Self::Audio => mime_guess::get_mime_extensions_str("audio/*"),
|
Self::Audio => mime_guess::get_mime_extensions_str("audio/*"),
|
||||||
Self::Documents => Some(&[
|
Self::Documents => Some(
|
||||||
"doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "pdf", "odt", "ods", "odp",
|
&[
|
||||||
][..]),
|
"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "odt", "ods", "odp", "oda", "rtf", "ps",
|
||||||
|
][..],
|
||||||
|
),
|
||||||
// many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used
|
// many compressed file types follow the name scheme "application/x.+compressed.*" - maybe this can be used
|
||||||
// somehow to extract extensions for compressed files from mime_guess?
|
// somehow to extract extensions for compressed files from mime_guess?
|
||||||
Self::Archives => Some(&["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2"][..]),
|
Self::Archives => Some(&["zip", "tar", "gz", "zst", "xz", "rar", "7z", "bz", "bz2"][..]),
|
||||||
_ => todo!(),
|
_ => todo!(),
|
||||||
}.unwrap().to_vec()
|
}
|
||||||
|
.unwrap()
|
||||||
|
.to_vec()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -105,37 +105,38 @@ impl Format for Script {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn rename<W: Write>(&self, f: &mut W, from: &PathBuf, to: &PathBuf) -> io::Result<()> {
|
fn rename<W: Write>(&self, f: &mut W, from: &PathBuf, to: &PathBuf) -> io::Result<()> {
|
||||||
smart_write(f, &[
|
smart_write(
|
||||||
|
f,
|
||||||
|
&[
|
||||||
"mv -v -i -- ".into(),
|
"mv -v -i -- ".into(),
|
||||||
from.into(),
|
from.into(),
|
||||||
Writable::Space,
|
Writable::Space,
|
||||||
to.into(),
|
to.into(),
|
||||||
Writable::Newline
|
Writable::Newline,
|
||||||
])
|
],
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn no_known_extension<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
fn no_known_extension<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
||||||
smart_write(f, &[
|
smart_write(
|
||||||
"echo No known extension for ".into(),
|
f,
|
||||||
path.into(),
|
&["echo No known extension for ".into(), path.into(), Writable::Newline],
|
||||||
Writable::Newline
|
)
|
||||||
])
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unreadable<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
fn unreadable<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
||||||
smart_write(f, &[
|
smart_write(f, &["# Failed to read ".into(), path.into(), Writable::Newline])
|
||||||
"# Failed to read ".into(),
|
|
||||||
path.into(),
|
|
||||||
Writable::Newline
|
|
||||||
])
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unknown_type<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
fn unknown_type<W: Write>(&self, f: &mut W, path: &PathBuf) -> io::Result<()> {
|
||||||
smart_write(f, &[
|
smart_write(
|
||||||
|
f,
|
||||||
|
&[
|
||||||
"# Failed to detect mime type for ".into(),
|
"# Failed to detect mime type for ".into(),
|
||||||
path.into(),
|
path.into(),
|
||||||
Writable::Newline
|
Writable::Newline,
|
||||||
])
|
],
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn header<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> {
|
fn header<W: Write>(&self, _: &Entries, f: &mut W) -> io::Result<()> {
|
||||||
|
|
|
@ -12,22 +12,12 @@ use crate::mimedb::MimeDb;
|
||||||
|
|
||||||
// use log::{debug, warn};
|
// use log::{debug, warn};
|
||||||
|
|
||||||
// from looking at the files in https://github.com/bojand/infer/tree/master/src/matchers, the format with the largest
|
// rather than reading once into a large buffer, it tends to be faster to first try identifying the file from a small
|
||||||
// buffer size requirement for identification requires 262 bytes, and the next largest buffer necessary is only 131
|
// chunk read from the top, and *then* proceeding with the large buffer. many file formats can be easily identified by
|
||||||
// bytes. as only two formats need more than 128 bytes, it would be fairly reasonable to only read 128 bytes.
|
// the first 128 bytes. of course, not all formats can, and some (OOXML...) require reading a long ways in.
|
||||||
// unfortunately, the format that requires 262 bytes for identification is tar, an extremely popular format (in the *nix
|
|
||||||
// world, at least). however, tar files almost always appear wrapped in other formats (.tar.gz, .tar.zst, etc) anyway,
|
|
||||||
// so maybe it's fine...? maybe this should be configurable by the user? i don't know.
|
|
||||||
// empirical testing (or rather, starting from 256 and incrementing until it worked) reveals that xdg-mime requires
|
|
||||||
// at least 265 bytes to identify a tar file.
|
|
||||||
|
|
||||||
// additionally, since many formats can be identified with ≤64 bytes, it's worth reading 64 bytes, checking for the mime
|
const INITIAL_BUF_SIZE: usize = 128;
|
||||||
// type, and then reading the full 512 bytes if necessary. in most cases, this will end up being faster on the whole,
|
const BUF_SIZE: usize = 4096;
|
||||||
// even though two reads are needed for certain formats, unless the directory being scanned is predominantly made up of
|
|
||||||
// such formats.
|
|
||||||
|
|
||||||
const INITIAL_BUF_SIZE: usize = 64;
|
|
||||||
const BUF_SIZE: usize = 512;
|
|
||||||
|
|
||||||
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
let mut buffer = [0; INITIAL_BUF_SIZE];
|
let mut buffer = [0; INITIAL_BUF_SIZE];
|
||||||
|
@ -38,19 +28,27 @@ pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
||||||
#[allow(clippy::unused_io_amount)]
|
#[allow(clippy::unused_io_amount)]
|
||||||
file.read(&mut buffer)?;
|
file.read(&mut buffer)?;
|
||||||
|
|
||||||
let r = db.get_type(&buffer);
|
let r = db.get_type(&buffer).filter(|mime|
|
||||||
|
// some mime types should be investigated further, reading up to BUF_SIZE even if they've been determined already
|
||||||
|
// one such type is XML - there are many more specific types that can be determined by reading further (such as SVG)
|
||||||
|
mime != &mime_guess::mime::TEXT_XML
|
||||||
|
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
|
||||||
|
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
|
||||||
|
// file.
|
||||||
|
&& mime != &Mime::from_str("application/zip").unwrap());
|
||||||
|
|
||||||
if r.is_some() {
|
if r.is_some() {
|
||||||
return Ok(r);
|
return Ok(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
// attempt to read up to the BUF_SIZE bytes of the file.
|
// attempt to read up to the BUF_SIZE bytes of the file.
|
||||||
// we've already read the first 64 bytes into a buffer, but i can't see an obvious way to reuse those 64 bytes that's
|
// we've already read the first 128 bytes into a buffer, but i can't see an obvious way to reuse them in a way that's
|
||||||
// faster than simply moving the seek position back to the start of the file and re-reading the whole 512 bytes.
|
// faster than simply moving the seek position back to the start of the file and re-reading the whole BUF_SIZE bytes.
|
||||||
// for example, starting with a buffer of 64 bytes, then creating a new 512 byte buffer from the contents of the first
|
// for example, reading (BUF_SIZE - INITIAL_BUF_SIZE) bytes, then concatenating the original INITIAL_BUF_SIZE buffer
|
||||||
// buffer with (512 - 64) blank bytes, then finally reading the rest, is much slower than simply reading the file
|
// with this new one would presumably be faster - but it's not. i think it's more expensive to create the iterator,
|
||||||
// twice. i don't at all doubt that there IS a way to do this efficiently, and i can think of a way in principle, but
|
// collect the contents, etc. i'll have to look into this more. i don't at all doubt that there IS a way to do this
|
||||||
// i'm not sure how to express it in a way that is both idiomatic/safe and fast.
|
// efficiently, and i can think of a way in principle, but i'm not sure how to express it in a way that is both
|
||||||
|
// idiomatic/safe and fast.
|
||||||
let mut buffer = [0; BUF_SIZE];
|
let mut buffer = [0; BUF_SIZE];
|
||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
file.read(&mut buffer)?;
|
file.read(&mut buffer)?;
|
||||||
|
@ -86,19 +84,9 @@ cached! {
|
||||||
// to have valid extensions.
|
// to have valid extensions.
|
||||||
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
|
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
|
||||||
|
|
||||||
} else if mime == Mime::from_str("application/zip").unwrap() {
|
} else if mime == Mime::from_str("application/msword").unwrap() {
|
||||||
// until proper document support is added, treat all common document extensions as zips
|
// classic office files considered harmful
|
||||||
[vec![
|
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
|
||||||
String::from("zip"),
|
|
||||||
String::from("docx"),
|
|
||||||
String::from("pptx"),
|
|
||||||
String::from("xlsx"),
|
|
||||||
String::from("odt"),
|
|
||||||
String::from("ods"),
|
|
||||||
String::from("odp"),
|
|
||||||
String::from("pages"),
|
|
||||||
String::from("key"),
|
|
||||||
String::from("numbers")], possible_exts].concat()
|
|
||||||
} else {
|
} else {
|
||||||
possible_exts
|
possible_exts
|
||||||
})
|
})
|
||||||
|
|
|
@ -253,7 +253,8 @@ fn main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
if results.is_empty() {
|
if results.is_empty() {
|
||||||
info!("All files have valid extensions!")
|
info!("All files have valid extensions!");
|
||||||
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
match args.output_format {
|
match args.output_format {
|
||||||
|
|
|
@ -13,6 +13,13 @@ pub struct InferDb {
|
||||||
db: infer::Infer,
|
db: infer::Infer,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn open_document_check(buf: &[u8], kind: &str) -> bool {
|
||||||
|
let mime = format!("application/vnd.oasis.opendocument.{}", kind);
|
||||||
|
let mime = mime.as_bytes();
|
||||||
|
|
||||||
|
buf.len() > 38 + mime.len() && buf.starts_with(b"PK\x03\x04") && buf[38..mime.len() + 38] == mime[..]
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(feature = "infer-backend")]
|
#[cfg(feature = "infer-backend")]
|
||||||
impl MimeDb for InferDb {
|
impl MimeDb for InferDb {
|
||||||
fn init() -> Self {
|
fn init() -> Self {
|
||||||
|
@ -23,6 +30,18 @@ impl MimeDb for InferDb {
|
||||||
buf.len() > 23 && buf[..23] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A\x6A\x70\x32\x20"[..]
|
buf.len() > 23 && buf[..23] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A\x6A\x70\x32\x20"[..]
|
||||||
});
|
});
|
||||||
|
|
||||||
|
info.add("application/vnd.oasis.opendocument.text", "odt", |buf| {
|
||||||
|
open_document_check(buf, "text")
|
||||||
|
});
|
||||||
|
|
||||||
|
info.add("application/vnd.oasis.opendocument.spreadsheet", "ods", |buf| {
|
||||||
|
open_document_check(buf, "spreadsheet")
|
||||||
|
});
|
||||||
|
|
||||||
|
info.add("application/vnd.oasis.opendocument.presentation", "odp", |buf| {
|
||||||
|
open_document_check(buf, "presentation")
|
||||||
|
});
|
||||||
|
|
||||||
info.add("image/svg+xml", "svg", |buf| {
|
info.add("image/svg+xml", "svg", |buf| {
|
||||||
// before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish
|
// before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish
|
||||||
// by "SGML-ish", i mean starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign,
|
// by "SGML-ish", i mean starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign,
|
||||||
|
|
|
@ -11,6 +11,7 @@ pub enum OutputFormat {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clap, Debug)]
|
#[derive(Clap, Debug)]
|
||||||
|
#[clap(version = option_env!("CARGO_PKG_VERSION").unwrap_or("???"))]
|
||||||
pub struct Parameters {
|
pub struct Parameters {
|
||||||
/// Only examine files with these extensions (Comma-separated list)
|
/// Only examine files with these extensions (Comma-separated list)
|
||||||
#[clap(
|
#[clap(
|
||||||
|
|
Loading…
Reference in a new issue