albumcutter

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

albumcutter.py (4979B)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
 AlbumCutter is a program that uses `youtube-dl` and `pydub` to download albums
 off of youtube and break them into individual files under a specified directory

 This class provides the functions to
  * Obtain the file
  * Parse track listing
  * From the track listing, splice the audio file into individual tracks and
    export them into the filesystem

 AlbumCutter is initialized with the URL of the YouTube video, followed by a
 string containing the tracklist (delimited by newlines), along with the output
 directory.

 Copyright (c) Paul Longtine <paul@nanner.co>
"""
import re, os, sys
from subprocess import call, Popen, PIPE
from pydub import AudioSegment

class AlbumCutter:
	"""Download an album from a video URL and split it into per-track MP3s.

	Constructing an instance runs the whole pipeline: create the output
	directory, download the audio with `youtube-dl`, load it with `pydub`,
	parse the tracklist and export one MP3 per track into the directory.

	Attributes:
		tracklist: Raw tracklist text, one track per line.
		url:       URL handed to `youtube-dl`.
		output:    Directory that receives the download and the tracks.
		fname:     Path of the downloaded audio file (set by load_audio).
		audio:     Loaded `AudioSegment` (set by load_audio).
		tracks:    List of {'start', 'end', 'title'} dicts
		           (set by process_tracklist).
	"""

	def __init__( self, url, tracklist, output, resume=False ):
		"""Run the download/split pipeline.

		Args:
			url:       URL of the video to fetch.
			tracklist: Newline-delimited tracklist containing timestamps.
			output:    Output directory; must not exist unless resuming.
			resume:    When true, skip directory creation and the download
			           and work with what is already in `output`.

		Exits the process with status 1 when the directory already exists
		(and `resume` is false) or cannot be created.
		"""
		self.tracklist = tracklist
		self.url       = url
		self.output    = output
		self.fname     = None
		self.audio     = None
		self.tracks    = None

		# Refuse to clobber an existing directory on a fresh run
		if os.path.exists(output) and not resume:
			print("ERROR: Directory exists, aborting")
			sys.exit(1)

		# Make the directory
		if not resume:
			try:
				os.mkdir(output)
			except OSError:
				print("ERROR: Could not make directory ({})".format(output))
				sys.exit(1)

			self.download_audio(self.url)

		self.load_audio(self.url)
		self.process_tracklist(self.tracklist)
		self.export(self.output)

	# loads the audio
	def load_audio( self, url ):
		"""Resolve the downloaded file's name and load it into `self.audio`.

		Exits the process with status 1 if the file cannot be loaded.
		"""
		self.fname = self.get_filename(url)
		print("Trying to load file ({})".format(self.fname))

		#TODO actually figure out the proper encoding mnemonic for this call.
		#     Currently, it just assumes the extension IS the mnemonic.
		# splitext only looks at the final component, so dots elsewhere in
		# the path (e.g. "./out/id.m4a") no longer confuse the lookup.
		extension = os.path.splitext(self.fname)[1][1:]
		try:
			# An empty extension is passed as None so the format is probed
			self.audio = AudioSegment.from_file(self.fname, extension or None)
		except Exception as e:
			print("ERROR: Could not load audio\n{}".format(e))
			sys.exit(1)

		print("Loaded audio")

	# fetches audio from URL.
	def download_audio( self, url ):
		"""Download the audio track of `url` into the output directory.

		Exits the process with status 1 if `youtube-dl` cannot be run or
		reports a failure.
		"""
		print("Downloading audio...")
		# youtube-dl -q -x -o$(output)/$(VIDEO ID).$(EXTENSION)
		try:
			status = call(['youtube-dl', '-q', '-x',
			               '-o{}/%(id)s.%(ext)s'.format(self.output), url])
		except OSError as e:
			print("ERROR: Could not run youtube-dl\n{}".format(e))
			sys.exit(1)

		if status != 0:
			print("ERROR: youtube-dl failed (exit status {})".format(status))
			sys.exit(1)

	# gets the filename
	def get_filename( self, url ):
		"""Ask `youtube-dl` which filename it uses for `url`.

		Returns:
			The filename printed by `youtube-dl`, without the trailing
			newline.

		Exits the process with status 1 if `youtube-dl` cannot be run or
		prints no filename.
		"""
		# This call here finds the filename for the audio just downloaded
		# It's not elegant at all, and it was the best solution I could find.
		print("Determining filename...")
		try:
			# universal_newlines makes the pipes text-mode, so the result is
			# a str (not bytes) on both Python 2 and Python 3.
			p = Popen(['youtube-dl', '--get-filename', '--skip-download',
			           '-x', '-o{}/%(id)s.%(ext)s'.format(self.output), url],
			           stdin=PIPE, stdout=PIPE, stderr=PIPE,
			           universal_newlines=True)
		except OSError as e:
			print("ERROR: Could not run youtube-dl\n{}".format(e))
			sys.exit(1)

		fname, err = p.communicate()
		# removes pesky newline, if it exists
		fname = fname.rstrip("\r\n")

		# Without a filename there is nothing to load; report why
		if fname == "":
			print("ERROR: Could not determine filename\n{}".format(err))
			sys.exit(1)

		return fname

	# gives meaning to tracklist using format:
	# [ { 'start': <starting time>, 'end': <ending time>, 'title': <title> }, ]
	def process_tracklist( self, tracklist):
		"""Parse `tracklist` into `self.tracks`.

		Every line that contains a timestamp ([H:]M:S) becomes a track that
		starts at that time and ends where the next track starts. The last
		track's end is -1, meaning "until the end of the audio". Lines
		without a timestamp are ignored.

		Titles are the line with timestamps and unusual characters removed;
		empty titles become "Track <n>" and duplicate titles get the track
		number appended so every title (and exported file name) is unique.
		"""
		tracks = []
		# Regex for finding hour, minute, and second. I am proud of this.
		rt = re.compile(r'((?P<hr>\d+):)?((?P<min>\d+):)(?P<sec>\d+)')

		seen_titles = set()
		track_num = 0
		for track in tracklist.split("\n"):
			# Finds hour, minute and second and converts it into milliseconds
			times = rt.search(track)
			if times is None:
				continue
			ms = self.to_ms(times.group('hr'),
			                times.group('min'),
			                times.group('sec'))
			# Remove timestamp and prettify the title
			title = rt.sub('', track)
			title = re.sub('[^-a-zA-Z0-9_.() ]+', '', title)
			title = title.strip()
			if title == "":
				title = "Track {}".format(track_num)

			# Enforces uniqueness; loops in case the suffixed title is
			# itself already taken
			while title in seen_titles:
				title = "{} {}".format(title, track_num)
			seen_titles.add(title)

			# Get previous track and set its end to this start time
			if tracks:
				tracks[-1]["end"] = ms
			tracks.append({"start":ms,
			               "end":-1,
			               "title":title})
			track_num += 1

		self.tracks = tracks

	# Converts hours, minutes, and seconds into ms and adds them into one number
	def to_ms( self, hour, minute, second ):
		"""Convert an hour/minute/second triple into milliseconds.

		`hour` and `minute` may be None (treated as 0). Returns None when
		`second` cannot be converted to an integer.
		"""
		hour   = 0 if hour is None else int(hour)
		minute = 0 if minute is None else int(minute)
		try:
			second = int(second)
		except (TypeError, ValueError):
			return None
		#        ms in hour         ms in minute        ms in second
		return (hour * 3600000) + (minute * 60000) + (second * 1000)

	# Exports tracks into specified directory
	def export( self, directory ):
		"""Write every parsed track to `directory` as "<title>.mp3".

		A track that fails to export is reported and skipped so the
		remaining tracks are still written.

		Raises:
			AssertionError: if the audio or the tracklist has not been
			                loaded yet.
		"""
		# Explicit raise (rather than `assert`) so the check survives -O
		if self.tracks is None or self.audio is None:
			raise AssertionError("audio and tracklist must be loaded first")

		# Loop through tracks and process them
		for track in self.tracks:
			print("Processing track: '{}'".format(track['title']))
			# -1 marks "until the end of the audio". Used directly as a slice
			# bound it is a negative index and clips the final millisecond,
			# so it is translated to an open-ended slice.
			end = None if track['end'] == -1 else track['end']
			t = self.audio[track['start']:end]
			path = os.path.join(directory, "{}.mp3".format(track['title']))
			try:
				t.export(path, format="mp3")
			except Exception as e:
				print("ERROR: Could not export ({})\n{}".format(path, e))

	def __del__( self ):
		# Drop the reference to the (potentially large) audio data. Assigning
		# instead of `del` keeps this safe on partially built instances.
		self.audio = None