fairly major restructuring that should make future format support a lot easier, support for songs with partially or fully incomplete metadata

2020-10-17 23:54:50 +10:00 · 2020-10-17 23:54:50 +10:00 · 5b95996d8c
commit 5b95996d8c
parent f78c8d7c78
1 changed files with 199 additions and 116 deletions
--- a/bcao.py
+++ b/bcao.py
@ -6,6 +6,7 @@
 # output: it organises it, adds cover art, puts it in the right place...

 import argparse
+import io
 import os
 import re
 import sys
@ -30,22 +31,112 @@ from mutagen.id3 import APIC, PictureType

 from PIL import Image

-vorbis_to_id3: Dict[str, str] = {
-	"track": "TRCK",
-	"artist": "TPE1",
-	"title": "TIT2",
-	"album": "TALB",
-	"album_artist": "TPE2"
-}
-vorbis_to_itunes: Dict[str, str] = {
-	"track": 'trkn',
-	"artist": '\xa9ART',
-	"title": '\xa9nam',
-	"album": '\xa9alb',
-	"album_artist": 'aART'
-}
-fully_supported: List[str] = ["ogg", "flac", "mp3", "m4a"]
+# file extensions that the --help epilog advertises as supporting cover art embedding.
+# NOTE(review): "wav" is listed here, but the cover handling visible in this change only has
+#  branches for ogg/flac/mp3/m4a - confirm that wav really belongs in this list.
+fully_supported: List[str] = ["ogg", "flac", "mp3", "m4a", "wav"]
+# type aliases used in annotations: an opened mutagen file, and its tag container
 MutagenFile = Union[MP3, FLAC, OggVorbis, mutagen.FileType]
+MutagenTags = Union[mutagen.id3.ID3Tags, mutagen.mp4.Tags, mutagen.oggvorbis.OggVCommentDict]
+
+class SongInfo:
+	"""One song file: its mutagen handle plus a normalised view of its tags.
+
+	Every tag is read from the file's metadata when it is present there; a tag that is
+	missing falls back to the value parsed from a file name of the form
+	"Artist - Album - 01 Title.ext". `fallback` is True when at least one tag could not
+	be read from the metadata.
+	"""
+
+	# standard tag name -> the key used for that tag in each metadata family
+	tag_lookup: Dict[str, Dict[str, str]] = {
+		"track":        {"id3": "TRCK", "m4a": "trkn", "vorbis": "tracknumber"},
+		"artist":       {"id3": "TPE1", "m4a": "©ART", "vorbis": "artist"},
+		"title":        {"id3": "TIT2", "m4a": "©nam", "vorbis": "title"},
+		"album":        {"id3": "TALB", "m4a": "©alb", "vorbis": "album"},
+		"album_artist": {"id3": "TPE2", "m4a": "aART", "vorbis": "albumartist"}
+	}
+	# file extension -> metadata family (one of the inner keys of tag_lookup)
+	format_lookup: Dict[str, str] = {
+		"mp3": "id3",
+		"m4a": "m4a",
+		"ogg": "vorbis",
+		"flac": "vorbis"
+	}
+
+	def __init__(self, file_name: Path):
+		"""Open `file_name` with mutagen and collect its tags.
+
+		Raises ValueError when a tag is absent from the metadata and the file name does
+		not match the expected pattern either, so no value can be determined for it.
+		"""
+		self.m_file: MutagenFile = mutagen.File(file_name)
+		# mutagen.File returns None when it cannot work out the file type
+		self.m_tags: Optional[MutagenTags] = None if self.m_file is None else self.m_file.tags
+		self.file_name = str(file_name.name)
+		# extension without the leading dot
+		self.format = file_name.suffix[1:]
+		self.fallback = False
+
+		fallbacks = re.match(r"^(.+) - (.+) - (\d{2,}) (.+)\.(?:ogg|flac|alac|aiff|wav|mp3|m4a)$", self.file_name)
+		# set default values for the tags, in case the file is missing any (or all!) of them.
+		# if the file name does not match there are no defaults; that only matters if the metadata is incomplete too.
+		self.tags: Dict[str, str] = {}
+		if fallbacks is not None:
+			self.tags = {
+				"track": fallbacks.group(3),
+				"artist": fallbacks.group(1),
+				"title": fallbacks.group(4),
+				"album": fallbacks.group(2),
+				"album_artist": fallbacks.group(1)
+			}
+		# set list_tags to the default tags in list form
+		# i.e. for every tag, set list_tags[x] = [tags[x]]
+		self.list_tags: Dict[str, List[str]] = {name: [value] for name, value in self.tags.items()}
+
+		# extensions without an entry in format_lookup have no known tag keys, so they rely on the file name alone
+		tag_family = self.format_lookup.get(self.format)
+
+		if self.m_tags is None or tag_family is None:
+			self.fallback = True
+
+		else:
+			for standard_name, tag_set in self.tag_lookup.items():
+				tag = tag_set[tag_family]
+
+				if tag not in self.m_tags:
+					self.fallback = True
+					continue
+
+				value_list = self.m_tags[tag]
+				if tag_family == "m4a" and standard_name == "track":
+					# every tag in the MP4 file (from what i can tell) is a list
+					# this includes the track number tag, which is a tuple of ints in a list, e.g. [(1, 5)].
+					# every other format gives a non-list, or a list of non-lists, so unwrap the outer list here;
+					# otherwise the track would end up as the string '(1, 5)'.
+					value_list = value_list[0]
+
+				if not isinstance(value_list, (list, tuple)):
+					value_list = [value_list]
+
+				# convert the list of strings/ID3 frames/ints/whatevers to sanitised strings
+				value_list = [sanitise(str(val)) for val in value_list]
+
+				if not value_list:
+					# the tag exists but holds no values: treat it like a missing tag
+					self.fallback = True
+					continue
+
+				self.tags[standard_name] = value_list[0]
+				self.list_tags[standard_name] = value_list
+
+		missing = [name for name in self.tag_lookup if name not in self.tags]
+		if missing:
+			raise ValueError(
+				f"Cannot determine {', '.join(missing)} for '{self.file_name}': not present in the metadata, "
+				"and the file name is not of the form 'Artist - Album - 01 Title.ext'."
+			)
+
+	def get_target_name(self, zeroes: int) -> str:
+		"""Return the output file name, "<track> <title>.<ext>", with the track zero-padded to `zeroes` digits."""
+		return f"{self.tags['track'].zfill(zeroes)} {self.tags['title']}.{self.format}"
+
+	def has_cover(self) -> bool:
+		"""Return True if the file already contains embedded cover art.
+
+		Raises NotImplementedError for formats that have no cover check.
+		"""
+		if self.format == "flac":
+			return len(self.m_file.pictures) != 0
+
+		if self.format not in ("ogg", "mp3", "m4a"):
+			raise NotImplementedError("Song format not yet implemented.")
+
+		if self.m_tags is None:
+			# no metadata at all, so there cannot be a cover either
+			return False
+
+		if self.format == "ogg":
+			return "metadata_block_picture" in self.m_tags and len(self.m_tags["metadata_block_picture"]) != 0
+
+		if self.format == "mp3":
+			# only a front cover counts; other APIC picture types are ignored
+			apics: List[APIC] = self.m_tags.getall("APIC")
+			return any(apic.type == PictureType.COVER_FRONT for apic in apics)
+
+		return 'covr' in self.m_tags and len(self.m_tags['covr']) != 0
+
+	def set_cover(self, embed_cover: Union[Picture, APIC, MP4Cover]):
+		"""Embed `embed_cover` into the file and save it.
+
+		`embed_cover` must be the picture type matching the format: Picture for ogg/flac,
+		APIC for mp3, MP4Cover for m4a. Other formats are saved without a cover.
+		Raises NotImplementedError if mutagen could not open the file.
+		"""
+		if self.m_file is None:
+			raise NotImplementedError("Song format not yet implemented.")
+
+		if self.m_tags is None and self.format in ("ogg", "mp3", "m4a"):
+			# a file without any metadata has no tag container yet; create an empty one to hold the cover
+			self.m_file.add_tags()
+			self.m_tags = self.m_file.tags
+
+		# embed cover art
+		if self.format == "ogg":
+			self.m_tags["metadata_block_picture"] = [b64encode(embed_cover.write()).decode("ascii")]
+		elif self.format == "flac":
+			self.m_file.clear_pictures()
+			self.m_file.add_picture(embed_cover)
+		elif self.format == "mp3":
+			self.m_tags.add(embed_cover)
+		elif self.format == "m4a":
+			self.m_tags['covr'] = [embed_cover]
+
+		self.m_file.save()
+
+	def __getitem__(self, item: str) -> str:
+		"""Return the first value of the standard tag `item`, e.g. song["album"]."""
+		return self.tags[item]

 def log(message: str, importance: int = 0):
 	if not args.quiet or importance > 0:
@ -55,42 +146,42 @@ def die(message: str, code: int = 1):
 	print(message)
 	sys.exit(code)

-def get_tag(mut_song: MutagenFile, tag: str, allow_list: bool = False, allow_sanitising: bool = True)\
-		-> Union[str, List[str]]:
-	if isinstance(mut_song, MP3):
-		tag = vorbis_to_id3[tag]
-		tag_list = mut_song.tags.getall(tag)
-
-	elif isinstance(mut_song, MP4):
-		# every tag in the MP4 file (from what i can tell) is a list
-		# this includes the track number tag, which is a list, containing a single tuple, containing two ints (track, total)
-		# unless we account for this, tag_list will be set to [(1, 5)], and then converted to a string, resulting in
-		#  ['(1, 5)'], which (if not allow_list) will be returned as '(1, 5)', which is not exactly helpful.
-		tag = vorbis_to_itunes[tag]
-		if tag == 'trkn':
-			# mut_song[tag] == [(1, 5)]
-			# mut_song[tag][0] == (1, 5)
-			tag_list = mut_song[tag][0]
-		else:
-			tag_list = mut_song[tag]
-
-	else:
-		if tag == "track":
-			tag = "tracknumber"
-		tag = tag.replace("_", "")
-		tag_list = mut_song[tag] if isinstance(mut_song[tag], list) else [mut_song[tag]]
-
-	# convert the list of strings/ID3 frames/ints/whatevers to strings
-	tag_list = list(map(str, tag_list))
-
-	# sanitise everything
-	if allow_sanitising:
-		tag_list = [sanitise(tag) for tag in tag_list]
-
-	if allow_list:
-		return tag_list
-
-	return tag_list[0]
+# def get_tag(mut_song: MutagenFile, tag: str, allow_list: bool = False, allow_sanitising: bool = True)\
+# 		-> Union[str, List[str]]:
+# 	if isinstance(mut_song, MP3):
+# 		tag = vorbis_to_id3[tag]
+# 		tag_list = mut_song.tags.getall(tag)
+#
+# 	elif isinstance(mut_song, MP4):
+# 		# every tag in the MP4 file (from what i can tell) is a list
+# 		# this includes the track number tag, which is a list, containing a single tuple, containing two ints (track, total)
+# 		# unless we account for this, tag_list will be set to [(1, 5)], and then converted to a string, resulting in
+# 		#  ['(1, 5)'], which (if not allow_list) will be returned as '(1, 5)', which is not exactly helpful.
+# 		tag = vorbis_to_itunes[tag]
+# 		if tag == 'trkn':
+# 			# mut_song[tag] == [(1, 5)]
+# 			# mut_song[tag][0] == (1, 5)
+# 			tag_list = mut_song[tag][0]
+# 		else:
+# 			tag_list = mut_song[tag]
+#
+# 	else:
+# 		if tag == "track":
+# 			tag = "tracknumber"
+# 		tag = tag.replace("_", "")
+# 		tag_list = mut_song[tag] if isinstance(mut_song[tag], list) else [mut_song[tag]]
+#
+# 	# convert the list of strings/ID3 frames/ints/whatevers to strings
+# 	tag_list = list(map(str, tag_list))
+#
+# 	# sanitise everything
+# 	if allow_sanitising:
+# 		tag_list = [sanitise(tag) for tag in tag_list]
+#
+# 	if allow_list:
+# 		return tag_list
+#
+# 	return tag_list[0]

 def has_cover(mut_song: MutagenFile):
 	if isinstance(mut_song, OggVorbis):
@ -120,8 +211,10 @@ def sanitise(in_str: str) -> str:
 parser = argparse.ArgumentParser(usage='%(prog)s zip [options]',
 	formatter_class=argparse.RawTextHelpFormatter,
 	description="Extracts the given zip file downloaded from Bandcamp and organises it.",
-	epilog=f"Cover art can only be embedded in files of the following types: {', '.join(fully_supported).upper()}.\nIf "
-	"the song is in any other format, %(prog)s will behave as though you passed '-c n', but will otherwise work normally.")
+	epilog=f"Cover art can only be embedded in files of the following types: {', '.join(fully_supported).upper()}.\n"
+"If the song is in any other format, %(prog)s will behave as though you passed '-c n', "
+"but will otherwise work normally.\nIf the song files contain no metadata, %(prog)s will attempt "
+"to parse the song's filenames to retrieve the artist, album, title, and track number.")
 parser.add_argument('zip', help='The zip file to use.')
 parser.add_argument('-c', '--add-cover-images', dest='process_cover', default='w', choices=['n', 'a', 'w'],
 	help="When to embed cover art into songs.\nOptions: [n]ever, [a]lways, [w]hen necessary.\nDefault: %(default)s")
@ -149,7 +242,7 @@ song_names: List[str] = []

 with ZipFile(args.zip, 'r') as zip_file:
 	for file in zip_file.namelist():
-		if re.match(r"^(.+ - ){2}\d{2,} .+\.(ogg|flac|alac|aiff|wav|mp3|opus|m4a|aac)$", file):
+		if re.match(r"^(.+ - ){2}\d{2,} .+\.(ogg|flac|alac|aiff|wav|mp3|m4a)$", file):
 			# bandcamp zips contain songs with names formatted like "Artist - Album - 01 Song.mp3"
 			# for example, "King Crimson - In the Wake of Poseidon - 02 Pictures of a City.ogg"
 			# this regex should match only on those, and cut out (hopefully) all of the bonus material stuff, which shouldn't
@ -198,21 +291,21 @@ if args.process_cover != 'n':
 	with open(temp_cover, 'r+b') as cover_file:
 		data = cover_file.read()

-	with Image.open(temp_cover) as image:
-		# it's really strange that the more annoying the file's metadata is, the *less* annoying it is to create cover art
-		#  for it in mutagen.
-		# vorbis: open standard, so easy to use that mutagen supplies a bunch of "easy" wrappers around other formats to
-		#  make them work more like mutagen.
-		#  cover-annoy-o-meter: high. mutagen requires you to specify the width, height, colour depth, etc etc
-		# id3: well documented, but rather cryptic (which is more understandable, "album_artist" or "TPE2").
-		#  cover-annoy-o-meter: not bad at all - at least you get a constructor this time - although it is kinda annoying
-		#  that you have to specify the file encoding, and how you need both a type and a desc.
-		# m4a: scarce documentation, closed format, half reverse engineered from whatever itunes is doing, exists pretty
-		#  much exclusively in the realm of apple stuff.
-		#  cover-annoy-o-meter: all you need is the file data and the format type.
+	# it's really strange that the more annoying the file's metadata is, the *less* annoying it is to create cover art
+	#  for it in mutagen.
+	# vorbis: open standard, so easy to use that mutagen supplies a bunch of "easy" wrappers around other formats to
+	#  make them work more like vorbis.
+	#  cover-annoy-o-meter: high. mutagen requires you to specify the width, height, colour depth, etc etc
+	# id3: well documented, but rather cryptic (which is more understandable, "album_artist" or "TPE2").
+	#  cover-annoy-o-meter: not bad at all - at least you get a constructor this time - although it is kinda annoying
+	#  that you have to specify the file encoding, and how you need both a type and a desc.
+	# m4a: scarce documentation, closed format, half reverse engineered from whatever itunes is doing, exists pretty
+	#  much exclusively in the realm of apple stuff.
+	#  cover-annoy-o-meter: all you need is the file data and the format type.

-		if song_format in ["ogg", "flac"]:
-			# i hate this
+	if song_format in ["ogg", "flac"]:
+		# i hate this
+		with Image.open(io.BytesIO(data)) as image:
 			embed_cover = Picture()
 			embed_cover.data = data
 			embed_cover.type = PictureType.COVER_FRONT
@ -220,35 +313,37 @@ if args.process_cover != 'n':
 			embed_cover.width = image.size[0]
 			embed_cover.height = image.size[1]
 			embed_cover.depth = image.bits
-		elif song_format == "mp3":
-			# apparently APIC files get compressed on save if they are "large":
-			# https://mutagen.readthedocs.io/en/latest/api/id3_frames.html#mutagen.id3.APIC
-			# i don't know what that means (lossless text compression? automatic JPEG conversion?) and i don't know if or how
-			# i can disable it, which kinda sucks...
-			# if, for example, mutagen's threshold for "large" is 200KiB, then any file over that size would be reduced to
-			# below it, either by resizing or JPEG quality reduction or whatever, making the -t flag useless for values above
-			#  200 when saving MP3 files.
-			# the most i can tell is that mutagen uses zlib compression in some way or another for reading ID3 tags:
-			# https://github.com/quodlibet/mutagen/blob/release-1.45.1/mutagen/id3/_frames.py#L265
-			# however, it seems not to use zlib when *writing* tags, citing itunes incompatibility, in particular with APIC:
-			# https://github.com/quodlibet/mutagen/blob/release-1.45.1/mutagen/id3/_tags.py#L510
-			# given that this is the only reference to compression that i could find in the source code, and it says that
-			#  ID3v2 compression was disabled for itunes compatibility, i'm going to assume/hope it doesn't do anything weird.
-			# it's worth noting that mutagen has no dependencies outside of python's stdlib, which (currently) doesn't contain
-			# any method for JPEG compression, so i'm 99% sure the files won't be mangled.

-			embed_cover = APIC(
-				encoding=3, # utf-8
-				mime="image/jpeg",
-				type=PictureType.COVER_FRONT,
-				desc='cover',
-				data=data
-			)
-		elif song_format == "m4a":
-			embed_cover = MP4Cover(
-				data=data,
-				imageformat=MP4Cover.FORMAT_JPEG
-			)
+	elif song_format == "mp3":
+		# apparently APIC files get compressed on save if they are "large":
+		# https://mutagen.readthedocs.io/en/latest/api/id3_frames.html#mutagen.id3.APIC
+		# i don't know what that means (lossless text compression? automatic JPEG conversion?) and i don't know if or how
+		# i can disable it, which kinda sucks...
+		# if, for example, mutagen's threshold for "large" is 200KiB, then any file over that size would be reduced to
+		# below it, either by resizing or JPEG quality reduction or whatever, making the -t flag useless for values above
+		#  200 when saving MP3 files.
+		# the most i can tell is that mutagen uses zlib compression in some way or another for reading ID3 tags:
+		# https://github.com/quodlibet/mutagen/blob/release-1.45.1/mutagen/id3/_frames.py#L265
+		# however, it seems not to use zlib when *writing* tags, citing itunes incompatibility, in particular with APIC:
+		# https://github.com/quodlibet/mutagen/blob/release-1.45.1/mutagen/id3/_tags.py#L510
+		# given that this is the only reference to compression that i could find in the source code, and it says that
+		#  ID3v2 compression was disabled for itunes compatibility, i'm going to assume/hope it doesn't do anything weird.
+		# it's worth noting that mutagen has no dependencies outside of python's stdlib, which (currently) doesn't contain
+		# any method for JPEG compression, so i'm 99% sure the files won't be mangled.
+
+		embed_cover = APIC(
+			encoding=3, # utf-8
+			mime="image/jpeg",
+			type=PictureType.COVER_FRONT,
+			desc='cover',
+			data=data
+		)
+
+	elif song_format == "m4a":
+		embed_cover = MP4Cover(
+			data=data,
+			imageformat=MP4Cover.FORMAT_JPEG
+		)

 artists: List[str] = []
 album: Optional[str] = None
@ -256,32 +351,20 @@ songs: Dict[str, str] = {}
 zeroes = min(len(song_names), 2)
 first_loop: bool = True

-for song in song_names:
-	m: MutagenFile = mutagen.File(Path(tmp, song))
+# read every extracted song: collect the artists and album, work out each song's target
+# file name, and embed the cover art where requested.
+for song_name in song_names:
+	song = SongInfo(Path(tmp, song_name))
 	if first_loop:
 		# the first item in the artists list should be the album artist
-		artists.append(get_tag(m, "album_artist"))
-		album = get_tag(m, "album")
+		artists.append(song["album_artist"])
+		album = song["album"]
 		first_loop = False

 	# add the song's artist(s) to the list
-	map(artists.append, get_tag(m, "artist", allow_list=True))
-	songs[song] = f"{str(get_tag(m, 'track')).zfill(zeroes)} {get_tag(m, 'title')}.{song_format}"
+	# (extend, not map(): map() is lazy in python 3, so an unconsumed map() never calls append)
+	artists.extend(song.list_tags["artist"])
+	songs[song_name] = song.get_target_name(zeroes)

-	if args.process_cover == 'a' or (args.process_cover == 'w' and has_cover(m) is False):
-		log("Embedding cover art...")
-		# embed cover art
-		if song_format == "ogg":
-			m["metadata_block_picture"] = [b64encode(embed_cover.write()).decode("ascii")]
-		elif song_format == "flac":
-			m.clear_pictures()
-			m.add_picture(embed_cover)
-		elif song_format == "mp3":
-			m.tags.add(embed_cover)
-		elif song_format == "m4a":
-			m['covr'] = [embed_cover]
-
-		m.save()
+	if args.process_cover == 'a' or (args.process_cover == 'w' and song.has_cover() is False):
+		song.set_cover(embed_cover)

 # remove duplicate artists
 artists = list(dict.fromkeys(artists))