added comments and mach-o binary support to infer

This commit is contained in:
Lynne Megido 2022-01-18 18:44:46 +10:00
parent a0396e2e1e
commit 330b273be6
Signed by: lynnesbian
GPG Key ID: F0A184B5213D9F90
5 changed files with 40 additions and 8 deletions

View File

@ -4,6 +4,10 @@ Dates are given in YYYY-MM-DD format - for example, the 15th of October 2021 is
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased
### Added
- When using the [`infer`] backend, fif is now able to detect [Mach-O](https://en.wikipedia.org/wiki/Mach-O) binaries.
## v0.5.0 - 2022-01-01
### Changed
- The Minimum Supported Rust Version (MSRV) is now **1.54.0**.

8
Cargo.lock generated
View File

@ -80,9 +80,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "3.0.7"
version = "3.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12e8611f9ae4e068fa3e56931fded356ff745e70987ff76924a6e0ab1c8ef2e3"
checksum = "8c506244a13c87262f84bf16369740d0b7c3850901b6a642aa41b031a710c473"
dependencies = [
"atty",
"bitflags",
@ -754,9 +754,9 @@ dependencies = [
[[package]]
name = "siphasher"
version = "0.3.8"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba1eead9e94aa5a2e02de9e7839f96a007f686ae7a1d57c7797774810d24908a"
checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e"
[[package]]
name = "smallvec"

View File

@ -42,28 +42,56 @@ cfg_if! {
fn init() -> Self {
let mut info = infer::Infer::new();
// In addition to the file inferences provided by Infer, I've also added a few of my own below. Some of them
// replace Infer's existing ones, some of them are less than perfect, and still others are for relatively
// obscure formats, so I'm not really sure whether or not they should be contributed upstream.
// OpenDocument Text (used by e.g. LibreOffice Writer)
info.add("application/vnd.oasis.opendocument.text", "odt", |buf| {
open_document_check(buf, "text")
});
// OpenDocument Spreadsheet (LibreOffice Calc)
info.add("application/vnd.oasis.opendocument.spreadsheet", "ods", |buf| {
open_document_check(buf, "spreadsheet")
});
// OpenDocument Presentation (LibreOffice Impress)
info.add("application/vnd.oasis.opendocument.presentation", "odp", |buf| {
open_document_check(buf, "presentation")
});
// Ren'Py Archive (Ren'Py: https://www.renpy.org/)
info.add("application/x-rpa", "rpa", |buf| {
buf.len() >= 34 && buf.starts_with(b"RPA-") && buf[7] == b' ' && buf[24] ==b' '
});
// Mach-O Binaries (The executable format used by macOS)
// my source for most of this info is this article: https://h3adsh0tzz.com/2020/01/macho-file-format/
// like linux's ELF binaries, mach-o binaries do not typically have an extension, but if they did, it'd
// probably be something like ".macho", so, that'll do i guess. fif doesn't actually use the extensions
// specified here anyway.
info.add("application/x-mach-binary", "macho", |buf| {
// a 32-bit mach-o header occupies 28 bytes of space, so any input smaller than that cannot be a mach-o
// binary, even if it starts with the magic numbers.
// the three magic numbers that can appear are 0xFEEDFACF, 0xFEEDFACE, and 0xCAFEBABE. the code below
// checks for all three of these, in both big and little endian order.
// java class files also start with 0xCAFEBABE. since infer doesn't support detecting these files,
// collisions are not an issue. if, however, infer does gain support for identifying java class files, the
// 0xCAFEBABE check should be removed, as java bytecode files are far more prevalent than 32-bit universal
// mach-o binaries [citation needed].
buf.len() >= 28 && [b"\xFE\xED\xFA\xCF", b"\xFE\xED\xFA\xCE", b"\xCA\xFE\xBA\xBE", b"\xCF\xFA\xED\xFE",
b"\xCE\xFA\xED\xFE", b"\xBE\xBA\xFE\xCA"].iter().any(|magic_numbers| buf.starts_with(&magic_numbers[..]))
});
// info.add("application/x-msi", "msi", |buf| {
// TODO: find a way to detect MSI files properly - this just detects those weird windows OLE files and therefore
// also picks up on .doc files
// buf.starts_with(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1")
// });
// Scalable Vector Graphics
info.add("image/svg+xml", "svg", |buf| {
// before doing the moderately expensive SVG check, we should make sure that the input is actually SGML-ish,
// by which i mean, starts with anywhere from zero to ∞-1 whitespace characters, and then a less than sign,
@ -83,8 +111,8 @@ cfg_if! {
// - split the buffer up into chunks separated by the less than sign
// - check to see if this chunk starts with any of these identifiers:
let identifiers: Vec<&[u8]> = vec![b"svg", b"SVG", b"!DOCTYPE svg", b"!DOCTYPE SVG"];
// - if it does, the nested `any` will short circuit and immediately return true, causing the parent `any` to do
// the same
// - if it does, the nested `any` will short circuit and immediately return true, causing the parent `any` to
// do the same
// - and finally, if none of the chunks match, we'll return false
// TODO: this is kind of messy, i'd like to clean it up somehow :(

View File

@ -7,7 +7,7 @@ use std::collections::BTreeSet;
use std::path::PathBuf;
use cfg_if::cfg_if;
use clap::{Parser, ArgEnum};
use clap::{ArgEnum, Parser};
use crate::utils::{CLAP_LONG_VERSION, CLAP_VERSION};
use crate::String as StringType;

View File

@ -351,6 +351,7 @@ fn accepts_good_args() {
/// Ensures that output from the `-V` and `--version` flags is formatted properly.
fn check_version_output() {
use std::string::String;
use assert_cmd::Command;
use regex::Regex;
@ -364,7 +365,6 @@ fn check_version_output() {
output
);
// test `--version` matches the format of "fif x.y.z (OS, example backend, commit #1234abc)"
let mut cmd = Command::cargo_bin("fif").unwrap();
let output = cmd.arg("--version").ok().unwrap().stdout;