Weird Python script problem

I have an interesting case: I have Python code that returns the transcript of a YouTube video. The transcript is returned when I run it in my Windmill instance installed with docker-compose on my local machine (I used the commands from GitHub). In the Windmill SaaS it does not work: it shows a message saying that the video doesn't have a transcript (but it does). It does not work on my VPS installation either, which shows the same error as the SaaS. I installed the latest version today, 10/23/24. The pictures show this. The Python code is in the comment below.
No description
No description
No description
3 Replies
Elber Domingos
Python code:
from youtube_transcript_api import YouTubeTranscriptApi
import re
from typing import List, Dict, Union


# Function to extract video ID from YouTube URL
def get_video_id(url: str) -> Union[str, None]:
    """Extract the video ID from a YouTube URL.

    Recognised shapes are ``youtube.com/watch?...v=ID``, ``youtu.be/ID``,
    ``youtube.com/embed/ID`` and ``youtube.com/v/ID``. The scheme and any
    host prefix (``www.``, ``m.``) are optional because each pattern is
    searched for anywhere inside ``url``.

    Args:
        url: The URL (or URL-like string) to inspect.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or matches none of
        the recognised shapes.
    """
    if not url:
        return None

    patterns = (
        # ``v`` may follow other query parameters (``?feature=share&v=ID``);
        # the lazy prefix keeps the first ``v=`` when several are present.
        r"youtube\.com/watch\?(?:[^&#]*&)*?v=([^&#]+)",
        # Path-style IDs end at the next ``?``, ``&``, ``#`` or ``/`` so that
        # query strings, fragments and trailing slashes are not captured.
        r"youtu\.be/([^?&#/]+)",
        r"youtube\.com/embed/([^?&#/]+)",
        r"youtube\.com/v/([^?&#/]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


# Function to fetch YouTube transcript in specified languages
def get_youtube_transcript(url: str, languages: List[str]) -> Union[str, Dict]:
    """Return the transcript of a YouTube video as one space-joined string.

    Args:
        url: URL of the video; its ID is resolved with ``get_video_id``.
        languages: Language codes passed through to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` field of every transcript entry joined with single
        spaces, or a dict with an ``"error"`` key (plus a ``"message"`` key
        when the fetch itself failed) describing what went wrong.
    """
    video_id = get_video_id(url)
    if not video_id:
        return {"error": "Invalid YouTube URL"}

    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    except Exception as exc:
        # Any failure of the fetch is reported to the caller as data rather
        # than raised; the original exception text is kept under "error".
        details = str(exc)
        hint = (
            f"Could not retrieve a transcript for the video ID {video_id}. "
            f"Subtitles might be disabled or not available in the specified languages."
        )
        return {"error": details, "message": hint}
    else:
        # Collapse the per-caption entries into one block of text.
        return " ".join(entry["text"] for entry in entries)
from youtube_transcript_api import YouTubeTranscriptApi
import re
from typing import List, Dict, Union


# Function to extract video ID from YouTube URL
def get_video_id(url: str) -> Union[str, None]:
    """Extract the video ID from a YouTube URL.

    Recognised shapes are ``youtube.com/watch?...v=ID``, ``youtu.be/ID``,
    ``youtube.com/embed/ID`` and ``youtube.com/v/ID``. The scheme and any
    host prefix (``www.``, ``m.``) are optional because each pattern is
    searched for anywhere inside ``url``.

    Args:
        url: The URL (or URL-like string) to inspect.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or matches none of
        the recognised shapes.
    """
    if not url:
        return None

    patterns = (
        # ``v`` may follow other query parameters (``?feature=share&v=ID``);
        # the lazy prefix keeps the first ``v=`` when several are present.
        r"youtube\.com/watch\?(?:[^&#]*&)*?v=([^&#]+)",
        # Path-style IDs end at the next ``?``, ``&``, ``#`` or ``/`` so that
        # query strings, fragments and trailing slashes are not captured.
        r"youtu\.be/([^?&#/]+)",
        r"youtube\.com/embed/([^?&#/]+)",
        r"youtube\.com/v/([^?&#/]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


# Function to fetch YouTube transcript in specified languages
def get_youtube_transcript(url: str, languages: List[str]) -> Union[str, Dict]:
    """Return the transcript of a YouTube video as one space-joined string.

    Args:
        url: URL of the video; its ID is resolved with ``get_video_id``.
        languages: Language codes passed through to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` field of every transcript entry joined with single
        spaces, or a dict with an ``"error"`` key (plus a ``"message"`` key
        when the fetch itself failed) describing what went wrong.
    """
    video_id = get_video_id(url)
    if not video_id:
        return {"error": "Invalid YouTube URL"}

    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    except Exception as exc:
        # Any failure of the fetch is reported to the caller as data rather
        # than raised; the original exception text is kept under "error".
        details = str(exc)
        hint = (
            f"Could not retrieve a transcript for the video ID {video_id}. "
            f"Subtitles might be disabled or not available in the specified languages."
        )
        return {"error": details, "message": hint}
    else:
        # Collapse the per-caption entries into one block of text.
        return " ".join(entry["text"] for entry in entries)
Rest of the code:
# Main function for Windmill.dev
def main(
    url: str, with_timestamps: bool = False, languages: List[str] = ["en", "pt"]
) -> Union[str, Dict]:
    """Script entry point: fetch the transcript of a YouTube video.

    Args:
        url: URL of the YouTube video.
        with_timestamps: Accepted for interface compatibility. It currently
            has no effect, because ``get_youtube_transcript`` only returns
            the joined text.
        languages: Requested language codes. Only ``"en"`` and ``"pt"`` are
            kept; anything else is dropped. The list default is left in the
            signature unchanged and is never mutated here.

    Returns:
        The transcript text, or a dict containing an ``"error"`` key when no
        supported language was requested or the transcript could not be
        fetched.
    """
    supported = ("en", "pt")
    # Build a fresh list so neither the caller's list nor the default is touched.
    requested = [lang for lang in (languages or []) if lang in supported]

    # Fail early with a clear message instead of handing an empty language
    # list to the transcript API and surfacing its less specific error.
    if not requested:
        return {
            "error": "No supported language requested",
            "message": "Only the languages 'en' and 'pt' are supported.",
        }

    # TODO: with_timestamps is not honoured yet; the error dict and the plain
    # text are both returned unchanged, exactly as before.
    return get_youtube_transcript(url, requested)
# Main function for Windmill.dev
def main(
    url: str, with_timestamps: bool = False, languages: List[str] = ["en", "pt"]
) -> Union[str, Dict]:
    """Script entry point: fetch the transcript of a YouTube video.

    Args:
        url: URL of the YouTube video.
        with_timestamps: Accepted for interface compatibility. It currently
            has no effect, because ``get_youtube_transcript`` only returns
            the joined text.
        languages: Requested language codes. Only ``"en"`` and ``"pt"`` are
            kept; anything else is dropped. The list default is left in the
            signature unchanged and is never mutated here.

    Returns:
        The transcript text, or a dict containing an ``"error"`` key when no
        supported language was requested or the transcript could not be
        fetched.
    """
    supported = ("en", "pt")
    # Build a fresh list so neither the caller's list nor the default is touched.
    requested = [lang for lang in (languages or []) if lang in supported]

    # Fail early with a clear message instead of handing an empty language
    # list to the transcript API and surfacing its less specific error.
    if not requested:
        return {
            "error": "No supported language requested",
            "message": "Only the languages 'en' and 'pt' are supported.",
        }

    # TODO: with_timestamps is not honoured yet; the error dict and the plain
    # text are both returned unchanged, exactly as before.
    return get_youtube_transcript(url, requested)
rubenf
rubenf7d ago
I think YouTube is filtering those IPs: basically, Google detects that those IPs are AWS-based and blocks them.
Elber Domingos
Hmm, I hadn't thought about that. I have a proxy server; I will try it and report back.