from youtube_transcript_api import YouTubeTranscriptApi
import re
from typing import List, Dict, Union
# Function to extract video ID from YouTube URL
def get_video_id(url: str) -> Union[str, None]:
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes:
      * ``youtube.com/watch?...v=<id>`` (``v`` may appear anywhere in the query)
      * ``youtu.be/<id>``
      * ``youtube.com/embed/<id>``, ``/v/<id>``, ``/shorts/<id>``, ``/live/<id>``
        (also on ``youtube-nocookie.com``)

    Args:
        url: The URL (or URL-containing text) to inspect.

    Returns:
        The video ID with any trailing query string, fragment, path
        separator or whitespace removed, or ``None`` when *url* is not a
        string or no supported pattern matches.
    """
    # Non-string input (e.g. None) would make re.search raise TypeError;
    # treat it as "no ID found" so callers get the documented None.
    if not isinstance(url, str):
        return None

    # re.search is unanchored, so the scheme and any subdomain (www., m.,
    # music., ...) need no explicit handling in the patterns.
    # Every capture stops at '&', '#' or whitespace (and at '?' or '/' for
    # path-style URLs) so nothing after the ID leaks into it.
    patterns = (
        # The optional group skips query parameters that precede "v=".
        r"youtube\.com/watch\?(?:[^#\s]*&)?v=([^&#\s]+)",
        r"youtu\.be/([^?&#/\s]+)",
        r"youtube(?:-nocookie)?\.com/(?:embed|v|shorts|live)/([^?&#/\s]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
# Function to fetch YouTube transcript in specified languages
def get_youtube_transcript(url: str, languages: List[str]) -> Union[str, Dict]:
    """Return the transcript text of a YouTube video as one string.

    Args:
        url: YouTube URL; the video ID is extracted with ``get_video_id``.
        languages: Language codes passed through unchanged as the
            ``languages`` argument of ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        On success, the ``"text"`` value of every transcript entry joined
        with single spaces. On failure, a dict: ``{"error": ...}`` when no
        video ID could be extracted, or ``{"error": ..., "message": ...}``
        when fetching the transcript raised an exception.
    """
    vid = get_video_id(url)
    if not vid:
        return {"error": "Invalid YouTube URL"}

    try:
        segments = YouTubeTranscriptApi.get_transcript(vid, languages=languages)
    except Exception as exc:
        # Any failure from the API call is reported to the caller as a dict
        # instead of propagating.
        explanation = (
            f"Could not retrieve a transcript for the video ID {vid}. "
            f"Subtitles might be disabled or not available in the specified languages."
        )
        return {"error": str(exc), "message": explanation}

    # Collapse the individual entries into a single space-separated string.
    return " ".join(segment["text"] for segment in segments)