diff --git a/.idea/runConfigurations/Run.xml b/.idea/runConfigurations/Run.xml
index f367ac8..e2e10c4 100644
--- a/.idea/runConfigurations/Run.xml
+++ b/.idea/runConfigurations/Run.xml
@@ -6,6 +6,7 @@
+
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index 94a25f7..ef8c41b 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -2,5 +2,6 @@
+
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41c6bec..86c1f12 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Added
- AIFF (Audio Interchange File Format, a PCM audio format like WAV) detection to [`infer`] backend
- `--version` output now includes the (short) hash of the git commit fif was built from
+- `-j`/`--jobs` flag for specifying the number of threads fif should use for scanning files
+### Changed
+- fif will no longer use multithreading when scanning less than 32 files - the overhead of spawning threads isn't really
+ worth it
### Other
- Refactoring - split fif into `main.rs` and `lib.rs`, moved file-related functionality (directory scanning, etc.) into
files module, removed string module, etc.
diff --git a/Cargo.lock b/Cargo.lock
index 40ed294..8b79617 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,5 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
+version = 3
+
[[package]]
name = "arrayvec"
version = "0.5.2"
@@ -186,6 +188,7 @@ dependencies = [
"log",
"mime",
"new_mime_guess",
+ "num_cpus",
"once_cell",
"rand",
"rayon",
diff --git a/Cargo.toml b/Cargo.toml
index ea0f63b..f3c5e87 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,7 @@ maintenance = { status = "experimental" }
[features]
default = ["multi-threaded", "json"]
-multi-threaded = ["rayon"]
+multi-threaded = ["rayon", "num_cpus"]
infer-backend = ["infer"]
xdg-mime-backend = ["xdg-mime"]
json = ["serde", "serde_json"]
@@ -36,6 +36,7 @@ itertools = "0.10.0"
serde = { version = "1.0", features = ["derive"], optional = true }
serde_json = { version = "1.0", optional = true }
bitflags = "~1.2.1" # 1.3+ requires Rust >= 1.46
+num_cpus = { version = "1.13.0", optional = true }
[target.'cfg(not(unix))'.dependencies]
xdg-mime = { version = "0.3.3", optional = true }
diff --git a/src/files.rs b/src/files.rs
index d1072e4..9342067 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -136,25 +136,42 @@ pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result Vec> {
+pub fn scan_from_walkdir(
+ entries: &[DirEntry],
+ canonical_paths: bool,
+ use_threads: bool,
+) -> Vec> {
cfg_if! {
if #[cfg(feature = "multi-threaded")] {
use rayon::prelude::*;
+ const CHUNKS: usize = 32;
- // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
- entries
- .par_chunks(32)
- .flat_map(|chunk| {
- chunk
- .iter() // iter over the chunk, which is a slice of DirEntry structs
- .map(|entry| scan_file(entry, canonical_paths))
- .collect::>()
- })
- .collect()
+ if use_threads && entries.len() > CHUNKS {
+ // split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
+ return entries
+ .par_chunks(CHUNKS)
+ .flat_map(|chunk| {
+ chunk
+ .iter() // iter over the chunk, which is a slice of DirEntry structs
+ .map(|entry| scan_file(entry, canonical_paths))
+ .collect::>() // TODO: is there a way to avoid having to collect here?
+ })
+ .collect()
+ }
} else {
- entries.iter().map(|entry: &DirEntry| scan_file(entry, canonical_paths)).collect()
+ // should always be false when multi-threading is disabled at compile time
+ assert!(!use_threads)
}
}
+
+ // if we end up here, either
+ // - there were less than CHUNKS files to scan, or
+ // - the user specified that only one thread should be used, by specifying `-j 1`
+ // - fif was compiled without the `multi-threading` feature
+ entries
+ .iter()
+ .map(|entry: &DirEntry| scan_file(entry, canonical_paths))
+ .collect()
}
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
diff --git a/src/formats.rs b/src/formats.rs
index 9ac226e..8c6dc4c 100644
--- a/src/formats.rs
+++ b/src/formats.rs
@@ -7,7 +7,7 @@ use std::os::unix::ffi::OsStrExt;
use std::path::Path;
use cfg_if::cfg_if;
-use itertools::{Either, Itertools};
+use itertools::Itertools;
use snailquote::escape;
use crate::findings::ScanError;
@@ -338,6 +338,8 @@ pub struct Json;
#[cfg(feature = "json")]
impl Format for Json {
fn write_all(&self, f: &mut W, entries: &Entries) -> io::Result<()> {
+ use itertools::Either;
+
#[derive(serde::Serialize)]
struct SerdeEntries<'a> {
errors: &'a Vec<&'a ScanError<'a>>,
diff --git a/src/main.rs b/src/main.rs
index ac2a9a7..b21e6c3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -20,6 +20,7 @@
use std::io::{stdout, BufWriter, Write};
use std::process::exit;
+use cfg_if::cfg_if;
use clap::Clap;
use fif::files::{scan_directory, scan_from_walkdir};
use fif::formats::Format;
@@ -87,7 +88,28 @@ fn main() {
trace!("Found {} items to check", entries.len());
- let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths)
+ cfg_if! {
+ if #[cfg(feature = "multi-threaded")] {
+ let use_threads = args.jobs != 1;
+
+ if use_threads {
+ // 0 is a special case - it should be understood to mean "all available host CPUs"
+ let jobs = if args.jobs == 0 { num_cpus::get() } else { args.jobs };
+
+ // set up the global thread pool with the requested number of threads
+ rayon::ThreadPoolBuilder::new().num_threads(jobs).build_global().unwrap();
+ trace!("Multithreading enabled, using {} threads", jobs);
+ } else {
+ trace!("Multithreading disabled at runtime");
+ }
+
+ } else { // `multi-threading` feature disabled
+ let use_threads = false;
+ trace!("Multithreading disabled at compile time");
+ }
+ }
+
+ let results: Vec<_> = scan_from_walkdir(&entries, args.canonical_paths, use_threads)
.into_iter()
.filter(
|result| result.is_err() || !result.as_ref().unwrap().valid,
diff --git a/src/parameters.rs b/src/parameters.rs
index a58793f..4c27491 100644
--- a/src/parameters.rs
+++ b/src/parameters.rs
@@ -133,6 +133,14 @@ pub struct Parameters {
/// For example, with this option, fif will not rename "image.unknown" to "image.jpg"
#[clap(short = 'I', long)]
pub ignore_unknown_exts: bool,
+
+ #[cfg(feature = "multi-threaded")]
+ /// Number of jobs (threads) to use when scanning results.
+ /// The default behaviour is to use one thread per CPU thread. This behaviour can be manually requested by setting
+ /// `-j 0`. Using `-j 1` will disable multi-threading behaviour, as if you had compiled fif with the multi-threading
+ /// feature disabled. Setting more jobs than you have CPU threads is not recommended.
+ #[clap(short = 'j', long, default_value = "0")]
+ pub jobs: usize,
}
fn lowercase_exts(exts: &str) -> Result<(), String> {
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index cd500d8..ec9ad07 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -114,8 +114,10 @@ fn simple_directory() {
// there should be one file missing: "ignore.fake_ext"
assert_eq!(entries.len(), files.len() - 1);
- let results = scan_from_walkdir(&entries, false);
- let canonical_results = scan_from_walkdir(&entries, true);
+ let use_threads = cfg!(feature = "multi-threaded");
+
+ let results = scan_from_walkdir(&entries, false, use_threads);
+ let canonical_results = scan_from_walkdir(&entries, true, use_threads);
assert_eq!(results.len(), canonical_results.len());
for (result, canonical_result) in results.iter().zip(canonical_results.iter()) {
@@ -291,6 +293,8 @@ fn rejects_bad_args() {
vec!["fif", "-X", "pebis"],
// `-e` with nothing but commas:
vec!["fif", "-e", ",,,,,"],
+ // `-j` with a negative value:
+ vec!["fif", "-j", "-1"],
];
for test in &tests {