2022-01-22 16:41:24 +00:00
|
|
|
// SPDX-FileCopyrightText: 2021-2022 Lynnesbian
|
|
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
2021-10-05 14:24:08 +00:00
|
|
|
|
2021-09-24 14:53:35 +00:00
|
|
|
//! File handling - scanning, detecting MIME types, and so on.
|
|
|
|
|
2021-09-24 12:02:04 +00:00
|
|
|
use std::collections::{BTreeSet, HashMap};
|
2021-08-28 08:09:15 +00:00
|
|
|
use std::fs::File;
|
2021-09-24 14:53:35 +00:00
|
|
|
use std::io::{self, Read, Seek, SeekFrom};
|
2021-08-28 08:09:15 +00:00
|
|
|
use std::path::Path;
|
|
|
|
use std::str::FromStr;
|
|
|
|
|
2021-08-28 07:59:04 +00:00
|
|
|
use cfg_if::cfg_if;
|
2021-10-04 10:22:15 +00:00
|
|
|
use itertools::{Either, Itertools};
|
2021-08-28 07:59:04 +00:00
|
|
|
use log::{debug, error};
|
2021-08-28 08:09:15 +00:00
|
|
|
use mime::Mime;
|
2021-08-28 07:59:04 +00:00
|
|
|
use mime_guess::from_ext;
|
2021-09-24 12:02:04 +00:00
|
|
|
use once_cell::sync::Lazy;
|
2021-11-22 22:38:43 +00:00
|
|
|
use parking_lot::RwLock;
|
2021-08-28 07:59:04 +00:00
|
|
|
use walkdir::{DirEntry, WalkDir};
|
2021-08-28 07:58:30 +00:00
|
|
|
|
2021-09-24 08:11:25 +00:00
|
|
|
use crate::findings::{Findings, ScanError};
|
|
|
|
use crate::mime_db::MimeDb;
|
|
|
|
use crate::parameters::ScanOpts;
|
2021-11-24 20:26:57 +00:00
|
|
|
use crate::utils::APPLICATION_ZIP;
|
2021-11-24 20:29:27 +00:00
|
|
|
use crate::{String, MIMEDB};
|
2021-09-24 08:11:25 +00:00
|
|
|
|
2021-11-24 20:29:27 +00:00
|
|
|
/// Cache of MIME types and their associated extensions, used by [`mime_extension_lookup()`]
|
2021-09-24 12:02:04 +00:00
|
|
|
static MIMEXT: Lazy<RwLock<HashMap<String, Option<Vec<String>>>>> = Lazy::new(|| RwLock::new(HashMap::new()));
|
|
|
|
|
2021-11-24 20:26:57 +00:00
|
|
|
/// The number of bytes to read initially when identifying a file's MIME type. Used in the [`mime_type`] function.
|
|
|
|
///
|
|
|
|
/// Rather than reading the entire file all at once into a [`BUF_SIZE`] buffer, it tends to be faster to read a small
|
|
|
|
/// chunk of the file and trying to identify that, proceeding with the larger buffer if that fails. Many file formats
|
|
|
|
/// can be identified with the first few dozen bytes, so the "happy path" will likely be taken in the majority of cases.
|
|
|
|
pub const INITIAL_BUF_SIZE: usize = 128;
|
|
|
|
|
|
|
|
/// The number of bytes to read if the file couldn't be identified from its first [`INITIAL_BUF_SIZE`] bytes. Used in
|
|
|
|
/// the [`mime_type`] function.
|
|
|
|
pub const BUF_SIZE: usize = 8192;
|
|
|
|
|
2021-11-24 20:29:27 +00:00
|
|
|
/// A [`Mime`] representing the "application/x-ole-storage" MIME type.
|
2021-11-24 20:26:57 +00:00
|
|
|
static APPLICATION_X_OLE_STORAGE: Lazy<Mime> = Lazy::new(|| Mime::from_str("application/x-ole-storage").unwrap());
|
|
|
|
|
2021-08-28 07:58:30 +00:00
|
|
|
cfg_if! {
    if #[cfg(windows)] {
        /// Reports whether `entry` is hidden, judged by the "hidden" bit of its win32 file attributes.
        ///
        /// If the file's metadata cannot be read, the entry is treated as not hidden.
        pub fn is_hidden(entry: &DirEntry) -> bool {
            use std::os::windows::prelude::*;
            // http://docs.microsoft.com/windows/win32/fileio/file-attribute-constants
            const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2;

            match std::fs::metadata(entry.path()) {
                Ok(metadata) => metadata.file_attributes() & FILE_ATTRIBUTE_HIDDEN != 0,
                // getting metadata/attributes failed - assume the file is not hidden
                Err(_) => false,
            }
        }
    } else {
        /// Reports whether `entry` is hidden, judged by its file name: a name counts as hidden when it starts with a
        /// full stop, unless the name is exactly ".".
        ///
        /// File names that are not valid UTF-8 are treated as not hidden.
        pub fn is_hidden(entry: &DirEntry) -> bool {
            match entry.file_name().to_str() {
                Some(name) => name.starts_with('.') && name != ".",
                None => false,
            }
        }
    }
}
|
|
|
|
|
|
|
|
/// Returns `true` if a file matches the given criteria. This means checking whether the file's extension appears in
|
|
|
|
/// `exts` (if specified), potentially skipping over hidden files, and so on.
|
|
|
|
pub fn wanted_file(
|
|
|
|
entry: &DirEntry,
|
|
|
|
exts: Option<&BTreeSet<&str>>,
|
|
|
|
exclude: Option<&BTreeSet<&str>>,
|
|
|
|
scan_opts: &ScanOpts,
|
|
|
|
) -> bool {
|
|
|
|
if entry.depth() == 0 {
|
|
|
|
// the root directory should always be scanned.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if !scan_opts.hidden && is_hidden(entry) {
|
|
|
|
// skip hidden files and directories. this check is performed first because it's very lightweight.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if entry.file_type().is_dir() {
|
|
|
|
// always allow directories - there's no point doing file extension matching on something that isn't a file.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Some(ext) = entry.path().extension() {
|
|
|
|
// file has extension - discard invalid UTF-8 and normalise it to lowercase.
|
|
|
|
let ext = ext.to_string_lossy().to_lowercase();
|
|
|
|
let ext = ext.as_str();
|
|
|
|
|
|
|
|
if scan_opts.ignore_unknown_exts && from_ext(ext).is_empty() {
|
|
|
|
// unknown extension, skip.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Some(exts) = exts {
|
|
|
|
// only scan if the file has one of the specified extensions.
|
|
|
|
exts.contains(&ext)
|
|
|
|
} else {
|
|
|
|
// no extensions specified - the file should be scanned unless its extension is on the exclude list.
|
|
|
|
exclude.map_or(true, |exclude| !exclude.contains(&ext))
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// no file extension
|
|
|
|
scan_opts.extensionless
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Inspects the given entry, returning a [`Findings`] on success and a [`ScanError`] on failure.
|
|
|
|
///
|
|
|
|
/// In the event of an IO error, the returned [`ScanError`] will be of type [`ScanError::File`]. Otherwise, a
|
2021-11-24 20:29:27 +00:00
|
|
|
/// [`ScanError::Mime`] will be returned, meaning that the file was scanned successfully, but a MIME type could not be
|
2021-08-28 07:58:30 +00:00
|
|
|
/// determined.
|
|
|
|
pub fn scan_file(entry: &DirEntry, canonical_paths: bool) -> Result<Findings, ScanError> {
|
|
|
|
let path = entry.path();
|
2021-11-24 20:29:27 +00:00
|
|
|
// try to determine MIME type for this entry
|
2021-09-24 14:53:35 +00:00
|
|
|
let result = match mime_type(&*MIMEDB, path) {
|
2021-08-28 07:58:30 +00:00
|
|
|
// an error occurred while trying to read the file
|
|
|
|
Err(_) => return Err(ScanError::File(path)),
|
2021-11-24 20:29:27 +00:00
|
|
|
// the file was read successfully, but we were unable to determine its MIME type
|
2021-08-28 07:58:30 +00:00
|
|
|
Ok(None) => return Err(ScanError::Mime(path)),
|
2021-11-24 20:29:27 +00:00
|
|
|
// a MIME type was found!
|
2021-08-28 07:58:30 +00:00
|
|
|
Ok(Some(result)) => result,
|
|
|
|
};
|
|
|
|
|
2022-05-02 07:55:48 +00:00
|
|
|
|
|
|
|
// determine whether or not the file's current extension is valid
|
|
|
|
let valid = if let Some(entry_ext) = path.extension() {
|
|
|
|
// discard invalid UTF-8 and convert to lowercase. all extensions in both backend's databases are lowercase
|
|
|
|
// ascii, so this assumption is fine.
|
|
|
|
let entry_ext = entry_ext.to_string_lossy().to_lowercase();
|
|
|
|
|
|
|
|
// if the file has any of these extensions, it is probably either:
|
|
|
|
// - a copy of another file renamed for backup purposes (e.g. a word processor might save by renaming "my.doc" to
|
|
|
|
// "my.doc.bak", then creating "my.doc", leaving the backup for safekeeping), which shouldn't be renamed so as
|
|
|
|
// not to break the backup program
|
|
|
|
// - a partially downloaded file, which shouldn't be renamed to avoid corrupting it and blocking the downloader
|
|
|
|
// from resuming
|
|
|
|
if ["bak", "backup", "filepart", "part", "crdownload"]
|
|
|
|
.iter()
|
|
|
|
.any(|ext| ext == &entry_ext)
|
|
|
|
{
|
|
|
|
true
|
|
|
|
} else {
|
|
|
|
// otherwise, check to see whether there's a known extension for this file type
|
|
|
|
|
|
|
|
// retrieve set of known extensions for the given MIME type
|
|
|
|
let known_exts = mime_extension_lookup(result.essence_str().into());
|
|
|
|
match known_exts {
|
|
|
|
// there is a known set of extensions for this MIME type - is entry_ext in the given set?
|
|
|
|
Some(e) => e.contains(&entry_ext.into()),
|
|
|
|
// there is no known set of extensions for this MIME type :(
|
|
|
|
None => false,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// this file has no extension
|
|
|
|
false
|
2021-08-28 07:58:30 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
let path = if canonical_paths {
|
|
|
|
match std::fs::canonicalize(path) {
|
|
|
|
Ok(path) => path,
|
|
|
|
Err(_) => return Err(ScanError::File(entry.path())),
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
path.to_path_buf() // :c
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok(Findings {
|
|
|
|
file: path,
|
|
|
|
valid,
|
|
|
|
mime: result,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Takes a slice of [`DirEntry`]s and calls [`scan_file`] on each one, returning the results in a vector.
|
2021-09-25 08:55:50 +00:00
|
|
|
pub fn scan_from_walkdir(
|
|
|
|
entries: &[DirEntry],
|
|
|
|
canonical_paths: bool,
|
|
|
|
use_threads: bool,
|
2021-10-04 10:22:15 +00:00
|
|
|
) -> (Vec<Findings>, Vec<ScanError>) {
|
2021-08-28 07:58:30 +00:00
|
|
|
cfg_if! {
|
|
|
|
if #[cfg(feature = "multi-threaded")] {
|
|
|
|
use rayon::prelude::*;
|
2021-09-25 08:55:50 +00:00
|
|
|
const CHUNKS: usize = 32;
|
|
|
|
|
|
|
|
if use_threads && entries.len() > CHUNKS {
|
|
|
|
// split the entries into chunks of 32, and iterate over each chunk of entries in a separate thread
|
|
|
|
return entries
|
|
|
|
.par_chunks(CHUNKS)
|
2021-10-04 15:45:18 +00:00
|
|
|
.flat_map_iter(|chunk| {
|
2021-09-25 08:55:50 +00:00
|
|
|
chunk
|
|
|
|
.iter() // iter over the chunk, which is a slice of DirEntry structs
|
2021-10-04 15:45:18 +00:00
|
|
|
.map(|entry| scan_file(entry, canonical_paths))
|
2021-11-05 15:10:20 +00:00
|
|
|
.collect::<Vec<_>>()
|
2021-10-04 15:45:18 +00:00
|
|
|
}).partition_map(|result| match result {
|
|
|
|
Ok(f) => Either::Left(f),
|
|
|
|
Err(e) => Either::Right(e),
|
|
|
|
});
|
2021-09-25 08:55:50 +00:00
|
|
|
}
|
2021-08-28 07:58:30 +00:00
|
|
|
} else {
|
2021-09-25 08:55:50 +00:00
|
|
|
// should always be false when multi-threading is disabled at compile time
|
|
|
|
assert!(!use_threads)
|
2021-08-28 07:58:30 +00:00
|
|
|
}
|
|
|
|
}
|
2021-09-25 08:55:50 +00:00
|
|
|
|
|
|
|
// if we end up here, either
|
|
|
|
// - there were less than CHUNKS files to scan, or
|
|
|
|
// - the user specified that only one thread should be used, by specifying `-j 1`
|
|
|
|
// - fif was compiled without the `multi-threading` feature
|
|
|
|
entries
|
|
|
|
.iter()
|
2021-10-04 10:22:15 +00:00
|
|
|
.partition_map(|entry: &DirEntry| match scan_file(entry, canonical_paths) {
|
|
|
|
Ok(f) => Either::Left(f),
|
|
|
|
Err(e) => Either::Right(e),
|
|
|
|
})
|
2021-08-28 07:58:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Scans a given directory with [`WalkDir`], filters with [`wanted_file`], checks for errors, and returns a vector of
|
2021-10-13 13:53:55 +00:00
|
|
|
/// [`DirEntry`]s.
|
2021-08-28 07:58:30 +00:00
|
|
|
pub fn scan_directory(
|
|
|
|
dirs: &Path,
|
|
|
|
exts: Option<&BTreeSet<&str>>,
|
|
|
|
exclude: Option<&BTreeSet<&str>>,
|
|
|
|
scan_opts: &ScanOpts,
|
|
|
|
) -> Option<Vec<DirEntry>> {
|
|
|
|
let stepper = WalkDir::new(dirs).follow_links(scan_opts.follow_symlinks).into_iter();
|
|
|
|
let mut probably_fatal_error = false;
|
|
|
|
let entries: Vec<DirEntry> = stepper
|
|
|
|
.filter_entry(|e| wanted_file(e, exts, exclude, scan_opts)) // filter out unwanted files
|
|
|
|
.filter_map(|e| {
|
|
|
|
if let Err(err) = &e {
|
|
|
|
debug!("uh oh spaghettio!! {:#?}", e);
|
|
|
|
// log errors to stdout, and remove them from the iterator
|
|
|
|
let path = err.path().map_or("General error".into(), Path::to_string_lossy);
|
|
|
|
|
|
|
|
if err.depth() == 0 {
|
|
|
|
// if something goes wrong while trying to read the root directory, we're probably not going to get much done
|
|
|
|
probably_fatal_error = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: is there a way to just say `map_or(x, |y| y).thing()` instead of `map_or(x.thing(), |y| y.thing())`?
|
|
|
|
// i don't care whether i'm returning a walkdir error or an io error, i just care about whether or not it
|
|
|
|
// implements ToString (which they both do). map_or doesn't work on trait objects though :(
|
2021-11-05 15:30:34 +00:00
|
|
|
error!("{}: {}", path, err.io_error().map_or(err.to_string(), |e| e.to_string()));
|
2021-08-28 07:58:30 +00:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
e.ok()
|
|
|
|
})
|
|
|
|
// remove directories from the final list
|
|
|
|
.filter(|e| !e.file_type().is_dir())
|
|
|
|
// if fif is invoked without `-f` on a symlinked directory, it will recurse into the symlink (as desired) and ignore
|
|
|
|
// any symlinks inside the symlinked root directory. however, the root directory will still be added to `entries` as
|
|
|
|
// if it were a file to be scanned, and `scan_file` will fail to scan it, adding "Failed to read ~/whatever" to the
|
|
|
|
// output. to avoid this, we can remove all symlinks from `entries` if `-f` is not set. i know this is kind of
|
|
|
|
// confusing, but it's honestly kind of hard to explain... maybe a screenshot is better:
|
|
|
|
// https://i.imgur.com/DYG7jlB.png
|
|
|
|
// adding the symlink filter removes the line that's being pointed to in the image. 0u0
|
|
|
|
.filter(|e| scan_opts.follow_symlinks || !e.file_type().is_symlink())
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
if probably_fatal_error {
|
|
|
|
None
|
|
|
|
} else {
|
|
|
|
Some(entries)
|
|
|
|
}
|
2021-08-28 07:59:04 +00:00
|
|
|
}
|
2021-08-28 08:09:15 +00:00
|
|
|
|
2021-11-24 20:29:27 +00:00
|
|
|
/// Tries to identify the MIME type of a file from a given path.
|
2021-08-28 08:09:15 +00:00
|
|
|
pub fn mime_type<T: MimeDb>(db: &T, path: &Path) -> io::Result<Option<Mime>> {
|
|
|
|
let mut buffer = [0; INITIAL_BUF_SIZE];
|
|
|
|
let mut file = File::open(path)?;
|
|
|
|
|
|
|
|
// read a small amount to start with
|
|
|
|
|
2021-10-04 16:12:16 +00:00
|
|
|
let mut read = io::Result::Ok(0);
|
|
|
|
|
|
|
|
for _ in 0..3 {
|
|
|
|
// try to read the file up to 3 times, retrying if interrupted, bailing otherwise
|
|
|
|
file.seek(SeekFrom::Start(0))?;
|
|
|
|
read = file.read(&mut buffer);
|
|
|
|
match read {
|
|
|
|
Ok(_) => break,
|
|
|
|
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
|
|
|
|
Err(_) => break,
|
|
|
|
}
|
|
|
|
}
|
2021-10-04 18:45:05 +00:00
|
|
|
|
2021-10-04 16:12:16 +00:00
|
|
|
let read = read?;
|
|
|
|
let r = db.get_type(&buffer);
|
|
|
|
|
|
|
|
if read < INITIAL_BUF_SIZE {
|
|
|
|
// the file is smaller than INITIAL_BUF_SIZE - there's no point reading it again
|
|
|
|
return Ok(r);
|
|
|
|
}
|
|
|
|
|
|
|
|
let r = r.filter(|mime|
|
2021-11-24 20:29:27 +00:00
|
|
|
// some MIME types should be investigated further, reading up to BUF_SIZE even if they've been determined already
|
2021-08-28 08:09:15 +00:00
|
|
|
// one such type is XML - there's many more specific types that can be determined by reading further (such as SVG)
|
|
|
|
mime != &mime::TEXT_XML
|
|
|
|
// another is ZIP - many file formats (DOCX, ODT, JAR...) are just ZIP files with particular data structures.
|
|
|
|
// determining that a file is in one of the MS office formats in particular requires looking quite far into the
|
|
|
|
// file.
|
2021-11-24 22:45:20 +00:00
|
|
|
&& mime != &*APPLICATION_ZIP
|
2021-08-28 08:09:15 +00:00
|
|
|
// doc/ppt/xls files are a subset of what's known as an "OLE2 compound document storage", at least according to
|
|
|
|
// shared-mime-info. if a pre-OOXML era MS office file is scanned and identified as x-ole-storage, reading further
|
|
|
|
// will allow it to be detected correctly as the appropriate filetype.
|
2021-11-24 22:45:20 +00:00
|
|
|
&& mime != &*APPLICATION_X_OLE_STORAGE);
|
2021-08-28 08:09:15 +00:00
|
|
|
|
|
|
|
if r.is_some() {
|
|
|
|
return Ok(r);
|
|
|
|
}
|
|
|
|
|
2021-10-04 16:12:16 +00:00
|
|
|
// attempt to read up to BUF_SIZE bytes of the file.
|
2021-08-28 08:09:15 +00:00
|
|
|
let mut buffer = [0; BUF_SIZE];
|
|
|
|
file.seek(SeekFrom::Start(0))?;
|
|
|
|
file.read(&mut buffer)?;
|
|
|
|
Ok(db.get_type(&buffer))
|
|
|
|
}
|
|
|
|
|
2021-11-24 20:29:27 +00:00
|
|
|
/// Returns a list of known extensions for this MIME type, if any.
|
2021-10-04 16:12:16 +00:00
|
|
|
/// This function uses the [`Mime`]'s "essence" rather than the [`Mime`] itself - [`mime_guess::get_mime_extensions`]
|
|
|
|
/// ignores the type suffix, treating "image/svg+xml" as "image/svg", and thus fails to find any extensions. Passing the
|
2021-10-04 18:45:05 +00:00
|
|
|
/// `essence_str` (which includes the suffix) fixes this.
|
2021-09-24 12:02:04 +00:00
|
|
|
pub fn mime_extension_lookup(essence: String) -> Option<Vec<String>> {
|
2021-11-22 22:38:43 +00:00
|
|
|
if let Some(exts) = MIMEXT.read().get(&essence) {
|
|
|
|
return exts.clone();
|
2021-09-24 12:02:04 +00:00
|
|
|
}
|
2021-08-28 08:09:15 +00:00
|
|
|
|
2021-09-24 12:02:04 +00:00
|
|
|
let mut exts = mime_guess::get_mime_extensions_str(essence.as_str());
|
|
|
|
if exts.is_none() {
|
|
|
|
// no matches :c
|
|
|
|
// mime_guess' database isn't exactly perfect... there are a lot of times where the db will return "some/x-thing"
|
|
|
|
// but mime_guess only understands "some/thing", or vice-versa.
|
|
|
|
// so, if there appear to be no extensions, try replacing "some/x-thing" with "some/thing", or "some/thing" with
|
|
|
|
// "some/x-thing".
|
|
|
|
if essence.contains("/x-") {
|
|
|
|
// replace e.g. "application/x-gzip" with "application/gzip"
|
|
|
|
exts = mime_guess::get_mime_extensions_str(&essence.replace("/x-", "/"));
|
|
|
|
} else {
|
|
|
|
// replace e.g. "video/mp2t" with "video/x-mp2t"
|
2021-12-17 00:05:04 +00:00
|
|
|
exts = mime_guess::get_mime_extensions_str(&essence.replace('/', "/x-"));
|
2021-08-28 08:09:15 +00:00
|
|
|
}
|
|
|
|
}
|
2021-09-24 12:02:04 +00:00
|
|
|
|
|
|
|
let exts = match exts {
|
|
|
|
Some(exts) => {
|
|
|
|
let possible_exts: Vec<String> = exts.iter().map(|e| String::from(*e)).collect();
|
|
|
|
|
|
|
|
Some(if essence == mime::IMAGE_JPEG.essence_str() {
|
|
|
|
// possible_exts starts with "jpe", because it's alphabetically before "jpeg" and "jpg". however, jpg/jpeg are
|
|
|
|
// far more common than jpe, so it makes sense to suggest one of those rather than jpe. to do this, we can
|
|
|
|
// add "jpg" to the start of the possible_exts list, ensuring that it will be the extension suggested by fif.
|
|
|
|
[vec![String::from("jpg")], possible_exts].concat()
|
|
|
|
} else if essence == mime::TEXT_XML.essence_str() || essence == "application/xml" {
|
|
|
|
// a somewhat similar case arises with XML files - the first suggested extension is "asa", when it should
|
|
|
|
// (in my opinion) be "xml".
|
|
|
|
// there's also another problem: SVG files can easily be misidentified as XML files, because they usually
|
|
|
|
// *are* valid XML - the more whitespace and comments an SVG file begins with, the more bytes must be read
|
|
|
|
// before it's possible to determine that it's an SVG rather than an XML file. to "fix" this, we can add "svg"
|
|
|
|
// as a valid extension for XML files, ensuring that SVG files misidentified as XML will still be considered
|
|
|
|
// to have valid extensions.
|
|
|
|
// TODO: if a file is detected as application/xml, but it has an extension like "xht" which corresponds to
|
|
|
|
// "application/xhtml+xml", let it through - in other words, if it's identified as application/xml, but its
|
|
|
|
// extension is classed as application/*+xml, consider it OK
|
|
|
|
[vec![String::from("xml"), String::from("svg")], possible_exts].concat()
|
|
|
|
} else if essence == "application/msword" {
|
|
|
|
// classic office files considered harmful
|
|
|
|
vec![String::from("doc"), String::from("xls"), String::from("ppt")]
|
|
|
|
} else if essence == "application/zip" {
|
2022-05-02 07:55:48 +00:00
|
|
|
// both backends seem to be unable to consistently detect OOXML files, so they should be considered valid
|
|
|
|
// extensions for zip files to prevent them being erroneously renamed.
|
|
|
|
// additionally, there are various popular formats that are just renamed zip files, such as android's apk
|
|
|
|
// format, that also shouldn't be renamed.
|
2021-09-24 12:02:04 +00:00
|
|
|
[
|
2022-05-02 07:55:48 +00:00
|
|
|
vec![
|
|
|
|
String::from("zip"),
|
|
|
|
String::from("docx"),
|
|
|
|
String::from("xlsx"),
|
|
|
|
String::from("pptx"),
|
|
|
|
String::from("apk"),
|
|
|
|
String::from("ipa"),
|
|
|
|
String::from("docbook"),
|
|
|
|
String::from("kdenlive"),
|
|
|
|
String::from("vcpkg"),
|
|
|
|
String::from("nupkg"),
|
|
|
|
String::from("whl"),
|
|
|
|
String::from("xpi"),
|
|
|
|
],
|
2021-09-24 12:02:04 +00:00
|
|
|
possible_exts,
|
|
|
|
]
|
|
|
|
.concat()
|
|
|
|
} else if essence == "application/x-ms-dos-executable" {
|
2022-05-02 07:55:48 +00:00
|
|
|
// .dll, .exe, .scr, etc. files are given the same MIME type, and aren't really distinguishable from each other
|
|
|
|
// ... but you definitely don't want to rename one to the other!
|
|
|
|
[
|
|
|
|
vec![
|
|
|
|
String::from("exe"),
|
|
|
|
String::from("dll"),
|
|
|
|
String::from("scr"),
|
|
|
|
String::from("com"),
|
|
|
|
String::from("dll16"),
|
|
|
|
String::from("drv"),
|
|
|
|
String::from("drv16"),
|
|
|
|
String::from("cpl"),
|
|
|
|
String::from("msstyles"),
|
|
|
|
String::from("sys"),
|
|
|
|
],
|
|
|
|
possible_exts,
|
|
|
|
]
|
|
|
|
.concat()
|
2021-09-24 12:02:04 +00:00
|
|
|
} else {
|
|
|
|
possible_exts
|
|
|
|
})
|
|
|
|
}
|
|
|
|
None => None,
|
|
|
|
};
|
|
|
|
|
2021-11-22 22:38:43 +00:00
|
|
|
MIMEXT.write().insert(essence, exts.clone());
|
|
|
|
exts
|
2021-08-28 08:09:15 +00:00
|
|
|
}
|