import io
import logging
import os
import subprocess
from pathlib import Path

from googleapiclient.http import MediaIoBaseDownload

from .youtube_api import init_youtube
# Module-level logger named after this module; used by TranscriptDownloader
# to report youtube-dl failures.
logger = logging.getLogger(__name__)
class TranscriptDownloader:
    """Download transcripts from YouTube using the YouTube API or ``youtube-dl``.

    Args:
        youtube: API client object exposing ``captions()``. Only needed when
            ``ytdl`` is False; if it is ``None`` in that case, a client is
            created with ``init_youtube(oauth=True)``.
        ytdl (bool, optional): If True (default), :meth:`download` shells out
            to ``youtube-dl`` instead of calling the API.
    """

    # Substrings of youtube-dl's stderr that are treated as transient
    # failures and trigger another attempt.
    _RETRYABLE_ERRORS = ("video is unavailable", "Unable to download webpage")
    # Total number of youtube-dl invocations (first attempt plus retries).
    _MAX_TRIES = 3

    def __init__(self, youtube=None, ytdl=True):
        self.ytdl = ytdl
        if youtube is None and not ytdl:
            self.youtube = init_youtube(oauth=True)
        else:
            self.youtube = youtube

    @staticmethod
    def check_suffix(output_path):
        """Validate the extension of ``output_path`` and derive the subtitle format.

        The extension must be ".srt" or ".vtt", or be absent, in which case
        ".vtt" is appended.

        Args:
            output_path (str or pathlib.Path): desired transcript location.

        Returns:
            tuple: ``(output_path, sub_format)`` where ``output_path`` is a
            :class:`pathlib.Path` (with ".vtt" appended if it had no
            extension) and ``sub_format`` is ``"srt"`` or ``"vtt"``.

        Raises:
            ValueError: if the extension is anything other than ".srt" or ".vtt".
        """
        # Accept plain strings as well as Path objects; Path(Path) is a no-op copy.
        output_path = Path(output_path)
        suffix = output_path.suffix

        if suffix == "":
            return output_path.with_suffix(".vtt"), "vtt"

        if suffix not in (".srt", ".vtt"):
            raise ValueError(
                "Only .srt and .vtt files are supported. You tried to create a "
                + suffix
                + " file."
            )

        return output_path, suffix[1:]

    @staticmethod
    def _run_command(command_array):
        """Run ``command_array`` and return its decoded ``(stdout, stderr)``."""
        completed_command = subprocess.run(
            command_array, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        # errors="replace" keeps undecodable bytes from raising while we are
        # only scanning the text for known messages.
        output = completed_command.stdout.decode("utf-8", errors="replace")
        errors = completed_command.stderr.decode("utf-8", errors="replace")
        return output, errors

    def get_transcript_ytdl(self, video_id, output_path):
        """Get the transcript for ``video_id`` using ``youtube-dl``.

        The extension of ``output_path`` is passed to ``youtube-dl`` as
        ``--sub-format``. The command is attempted up to three times while
        its stderr reports that the video is unavailable or the webpage
        could not be downloaded.

        Args:
            video_id (str): id of the YouTube video.
            output_path (str or pathlib.Path): where to save the transcript;
                validated by :meth:`check_suffix`.

        Returns:
            pathlib.Path or None: path of the saved transcript, or ``None`` if
            every attempt failed, the video has no subtitles, or
            ``youtube-dl`` reported any other error or warning.
        """
        output_path, sub_format = self.check_suffix(output_path)
        output_path_no_extension = os.path.splitext(output_path)[0]

        command_array = [
            "youtube-dl",
            "--sub-lang",
            "en",
            "--sub-format",
            sub_format,
            "--write-sub",
            "--skip-download",
            "-o",
            str(output_path_no_extension),
            # End of options: video ids may begin with "-" and would otherwise
            # be parsed as a command line flag.
            "--",
            video_id,
        ]

        for _ in range(self._MAX_TRIES):
            _, errors = self._run_command(command_array)
            if not any(marker in errors for marker in self._RETRYABLE_ERRORS):
                break
        else:
            # Only reached when every attempt ended with a retryable error;
            # a successful final attempt falls through to the checks below.
            logger.warning("YouTube timed out while getting %s", video_id)
            return None

        if "WARNING: video doesn't have subtitles" in errors:
            logger.warning(
                "%s does not contain a subtitle file for the specified language and format.",
                video_id,
            )
            return None

        # Any remaining stderr text containing a space, "ERROR" or "WARNING"
        # is treated as a failure.
        if " " in errors or "ERROR" in errors or "WARNING" in errors:
            logger.info("The youtube-dl command returned the following error message:")
            logger.error(errors)
            return None

        # remove the ".en" that youtube-dl adds
        os.rename((output_path_no_extension + ".en." + sub_format), output_path)

        return output_path

    def get_transcript_api(self, caption_id, output_path):
        """Download a caption track by id directly from the YouTube API.

        Args:
            caption_id (str): the id of the caption track to download
            output_path (str or pathlib.Path): path to save the captions.
                The file extension is validated by :meth:`check_suffix`.

        Returns:
            pathlib.Path: the path where the transcript was saved (may differ
            from ``output_path`` when ".vtt" had to be appended). The same
            value is stored in ``self.transcript_path``.
        """
        output_path, sub_format = self.check_suffix(output_path)
        request = self.youtube.captions().download(id=caption_id, tfmt=sub_format)

        # The context manager closes the file even if a chunk download raises.
        with io.FileIO(output_path, "wb") as fh:
            download = MediaIoBaseDownload(fh, request)
            complete = False
            while not complete:
                _, complete = download.next_chunk()

        self.transcript_path = output_path
        return output_path

    def get_caption_id(self, video_id, lang="en"):
        """Get the caption id with language ``lang`` for the YouTube video ``video_id``.

        Only tracks whose ``trackKind`` is ``"standard"`` are considered. The
        first match is stored in ``self.caption_id`` and returned.

        Args:
            video_id (str): id of the YouTube video.
            lang (str, optional): caption language to look for. Default is "en".

        Returns:
            str: id of the first matching caption track.

        Raises:
            Exception: if no standard caption track exists for ``lang``.
        """
        request = self.youtube.captions().list(part="snippet", videoId=video_id)
        response = request.execute()

        for caption in response["items"]:
            snippet = caption["snippet"]
            if snippet["trackKind"] == "standard" and snippet["language"] == lang:
                self.caption_id = caption["id"]
                return self.caption_id

        raise Exception("No caption track exists for language '" + lang + "'.")

    def download(self, video_id, output_path):
        """Convenience function to download a transcript with one call.

        If ``self.ytdl`` is False, calls :meth:`get_caption_id` and passes the
        result to :meth:`get_transcript_api`.
        If ``self.ytdl`` is True, calls :meth:`get_transcript_ytdl`.

        Args:
            video_id (str): id of the YouTube video.
            output_path (str or pathlib.Path): where to save the transcript.

        Returns:
            pathlib.Path or None: whatever the selected method returns.
        """
        if self.ytdl:
            return self.get_transcript_ytdl(video_id, output_path)

        caption_id = self.get_caption_id(video_id)
        return self.get_transcript_api(caption_id, output_path)
# downloader = TranscriptDownloader()
# output_path = Path("test.srt")
# transcript_path = downloader.download("Vss3nofHpZI", output_path)