Compare commits

..

No commits in common. "db80955f245272321690d75627d56906e17c1e65" and "741048839cf8688d2a8694fda40876118b7a3647" have entirely different histories.

10 changed files with 25 additions and 94 deletions

View file

@ -6,7 +6,6 @@
<option name="requiredFeatures" value="true" /> <option name="requiredFeatures" value="true" />
<option name="allFeatures" value="false" /> <option name="allFeatures" value="false" />
<option name="emulateTerminal" value="false" /> <option name="emulateTerminal" value="false" />
<option name="withSudo" value="false" />
<option name="backtrace" value="SHORT" /> <option name="backtrace" value="SHORT" />
<envs> <envs>
<env name="RUST_LOG" value="debug" /> <env name="RUST_LOG" value="debug" />

View file

@ -2,6 +2,5 @@
<project version="4"> <project version="4">
<component name="VcsDirectoryMappings"> <component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" /> <mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/src/walkdir" vcs="Git" />
</component> </component>
</project> </project>

View file

@ -5,15 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
[Semantic Versioning](https://semver.org/spec/v2.0.0.html). [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased ## Unreleased
## v0.3.7 - 2021-09-25
### Added ### Added
- `-j`/`--jobs` flag for specifying the number of threads fif should use for scanning files
- AIFF (Audio Interchange File Format, a PCM audio format like WAV) detection to [`infer`] backend - AIFF (Audio Interchange File Format, a PCM audio format like WAV) detection to [`infer`] backend
- `--version` output now includes the (short) hash of the git commit fif was built from - `--version` output now includes the (short) hash of the git commit fif was built from
### Changed
- fif will no longer use multithreading when scanning 32 or fewer files - the overhead of spawning threads isn't really
worth it
### Other ### Other
- Refactoring - split fif into `main.rs` and `lib.rs`, moved file-related functionality (directory scanning, etc.) into - Refactoring - split fif into `main.rs` and `lib.rs`, moved file-related functionality (directory scanning, etc.) into
files module, removed string module, etc. files module, removed string module, etc.

5
Cargo.lock generated
View file

@ -1,7 +1,5 @@
# This file is automatically @generated by Cargo. # This file is automatically @generated by Cargo.
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3
[[package]] [[package]]
name = "arrayvec" name = "arrayvec"
version = "0.5.2" version = "0.5.2"
@ -175,7 +173,7 @@ checksum = "de853764b47027c2e862a995c34978ffa63c1501f2e15f987ba11bd4f9bba193"
[[package]] [[package]]
name = "fif" name = "fif"
version = "0.3.7" version = "0.3.6"
dependencies = [ dependencies = [
"bitflags", "bitflags",
"cfg-if", "cfg-if",
@ -188,7 +186,6 @@ dependencies = [
"log", "log",
"mime", "mime",
"new_mime_guess", "new_mime_guess",
"num_cpus",
"once_cell", "once_cell",
"rand", "rand",
"rayon", "rayon",

View file

@ -1,7 +1,7 @@
[package] [package]
name = "fif" name = "fif"
description = "A command-line tool for detecting and optionally correcting files with incorrect extensions." description = "A command-line tool for detecting and optionally correcting files with incorrect extensions."
version = "0.3.7" version = "0.3.6"
authors = ["Lynnesbian <lynne@bune.city>"] authors = ["Lynnesbian <lynne@bune.city>"]
edition = "2018" edition = "2018"
license = "GPL-3.0-or-later" license = "GPL-3.0-or-later"
@ -17,7 +17,7 @@ maintenance = { status = "experimental" }
[features] [features]
default = ["multi-threaded", "json"] default = ["multi-threaded", "json"]
multi-threaded = ["rayon", "num_cpus"] multi-threaded = ["rayon"]
infer-backend = ["infer"] infer-backend = ["infer"]
xdg-mime-backend = ["xdg-mime"] xdg-mime-backend = ["xdg-mime"]
json = ["serde", "serde_json"] json = ["serde", "serde_json"]
@ -36,7 +36,6 @@ itertools = "0.10.0"
serde = { version = "1.0", features = ["derive"], optional = true } serde = { version = "1.0", features = ["derive"], optional = true }
serde_json = { version = "1.0", optional = true } serde_json = { version = "1.0", optional = true }
bitflags = "~1.2.1" # 1.3+ requires Rust >= 1.46 bitflags = "~1.2.1" # 1.3+ requires Rust >= 1.46
num_cpus = { version = "1.13.0", optional = true }
[target.'cfg(not(unix))'.dependencies] [target.'cfg(not(unix))'.dependencies]
xdg-mime = { version = "0.3.3", optional = true } xdg-mime = { version = "0.3.3", optional = true }

View file

@ -136,42 +136,25 @@ pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, Sc
} }
/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector. /// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
pub fn scan_from_walkdir( pub fn scan_from_walkdir(entries: &[DirEntry], canonical_paths: bool) -> Vec<Result<Findings, ScanError>> {
entries: &[DirEntry],
canonical_paths: bool,
use_threads: bool,
) -> Vec<Result<Findings, ScanError>> {
cfg_if! { cfg_if! {
if #[cfg(feature = "multi-threaded")] { if #[cfg(feature = "multi-threaded")] {
use rayon::prelude::*; use rayon::prelude::*;
const CHUNKS: usize = 32;
if use_threads && entries.len() > CHUNKS { // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
// split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread entries
return entries .par_chunks(32)
.par_chunks(CHUNKS) .flat_map(|chunk| {
.flat_map_iter(|chunk| { chunk
chunk .iter() // iter over the chunk, which is a slice of DirEntry structs
.iter() // iter over the chunk, which is a slice of DirEntry structs .map(|entry| scan_file(entry, canonical_paths))
.map(|entry| scan_file(entry, canonical_paths)) .collect::<Vec<_>>()
.collect::<Vec<_>>() // TODO: is there a way to avoid having to collect here? })
}) .collect()
.collect()
}
} else { } else {
// should always be false when multi-threading is disabled at compile time entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
assert!(!use_threads)
} }
} }
// if we end up here, either
// - there were CHUNKS or fewer files to scan,
// - the user specified that only one thread should be used, by specifying `-j 1`, or
// - fif was compiled without the `multi-threaded` feature
entries
.iter()
.map(|entry: &DirEntry| scan_file(entry, canonical_paths))
.collect()
} }
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of /// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of

View file

@ -7,7 +7,7 @@ use std::os::unix::ffi::OsStrExt;
use std::path::Path; use std::path::Path;
use cfg_if::cfg_if; use cfg_if::cfg_if;
use itertools::Itertools; use itertools::{Either, Itertools};
use snailquote::escape; use snailquote::escape;
use crate::findings::ScanError; use crate::findings::ScanError;
@ -338,8 +338,6 @@ pub struct Json;
#[cfg(feature = "json")] #[cfg(feature = "json")]
impl Format for Json { impl Format for Json {
fn write_all<W: Write>(&self, f: &mut W, entries: &Entries) -> io::Result<()> { fn write_all<W: Write>(&self, f: &mut W, entries: &Entries) -> io::Result<()> {
use itertools::Either;
#[derive(serde::Serialize)] #[derive(serde::Serialize)]
struct SerdeEntries<'a> { struct SerdeEntries<'a> {
errors: &'a Vec<&'a ScanError<'a>>, errors: &'a Vec<&'a ScanError<'a>>,

View file

@ -20,7 +20,6 @@
use std::io::{stdout, BufWriter, Write}; use std::io::{stdout, BufWriter, Write};
use std::process::exit; use std::process::exit;
use cfg_if::cfg_if;
use clap::Clap; use clap::Clap;
use fif::files::{scan_directory, scan_from_walkdir}; use fif::files::{scan_directory, scan_from_walkdir};
use fif::formats::Format; use fif::formats::Format;
@ -88,28 +87,7 @@ fn main() {
trace!("Found {} items to check", entries.len()); trace!("Found {} items to check", entries.len());
cfg_if! { let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths)
if #[cfg(feature = "multi-threaded")] {
let use_threads = args.jobs != 1;
if use_threads {
// 0 is a special case - it should be understood to mean "all available host CPUs"
let jobs = if args.jobs == 0 { num_cpus::get() } else { args.jobs };
// set up the global thread pool with the requested number of threads
rayon::ThreadPoolBuilder::new().num_threads(jobs).build_global().unwrap();
trace!("Multithreading enabled, using {} threads", jobs);
} else {
trace!("Multithreading disabled at runtime");
}
} else { // `multi-threading` feature disabled
let use_threads = false;
trace!("Multithreading disabled at compile time");
}
}
let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths, use_threads)
.into_iter() .into_iter()
.filter( .filter(
|result| result.is_err() || !result.as_ref().unwrap().valid, |result| result.is_err() || !result.as_ref().unwrap().valid,
@ -123,16 +101,12 @@ fn main() {
for result in &results { for result in &results {
match result { match result {
Ok(r) => { Ok(r) => {
// check to see if debug logging is enabled before invoking debug! macro debug!(
// https://github.com/rust-lang/log/pull/394#issuecomment-630490343 "{:?} is of type {}, should have extension \"{}\"",
if log::max_level() >= log::Level::Debug { r.file,
debug!( r.mime,
"{:?} is of type {}, should have extension \"{}\"", r.recommended_extension().unwrap_or_else(|| "???".into())
r.file, );
r.mime,
r.recommended_extension().unwrap_or_else(|| "???".into())
);
}
} }
Err(f) => warn!("{}", f), Err(f) => warn!("{}", f),
} }

View file

@ -133,14 +133,6 @@ pub struct Parameters {
/// For example, with this option, fif will not rename "image.unknown" to "image.jpg" /// For example, with this option, fif will not rename "image.unknown" to "image.jpg"
#[clap(short = 'I', long)] #[clap(short = 'I', long)]
pub ignore_unknown_exts: bool, pub ignore_unknown_exts: bool,
#[cfg(feature = "multi-threaded")]
/// Number of jobs (threads) to use when scanning results.
/// The default behaviour is to use one thread per CPU thread. This behaviour can be manually requested by setting
/// `-j 0`. Using `-j 1` will disable multi-threading behaviour, as if you had compiled fif with the multi-threading
/// feature disabled. Setting more jobs than you have CPU threads is not recommended.
#[clap(short = 'j', long, default_value = "0")]
pub jobs: usize,
} }
fn lowercase_exts(exts: &str) -> Result<(), String> { fn lowercase_exts(exts: &str) -> Result<(), String> {

View file

@ -114,10 +114,8 @@ fn simple_directory() {
// there should be one file missing: "ignore.fake_ext" // there should be one file missing: "ignore.fake_ext"
assert_eq!(entries.len(), files.len() - 1); assert_eq!(entries.len(), files.len() - 1);
let use_threads = cfg!(feature = "multi-threaded"); let results = scan_from_walkdir(&entries, false);
let canonical_results = scan_from_walkdir(&entries, true);
let results = scan_from_walkdir(&entries, false, use_threads);
let canonical_results = scan_from_walkdir(&entries, true, use_threads);
assert_eq!(results.len(), canonical_results.len()); assert_eq!(results.len(), canonical_results.len());
for (result, canonical_result) in results.iter().zip(canonical_results.iter()) { for (result, canonical_result) in results.iter().zip(canonical_results.iter()) {
@ -293,8 +291,6 @@ fn rejects_bad_args() {
vec!["fif", "-X", "pebis"], vec!["fif", "-X", "pebis"],
// `-e` with nothing but commas: // `-e` with nothing but commas:
vec!["fif", "-e", ",,,,,"], vec!["fif", "-e", ",,,,,"],
// `-j` with a negative value:
vec!["fif", "-j", "-1"],
]; ];
for test in &tests { for test in &tests {