Elber Domingos Comments

Elber Domingos

Posts Comments

WWindmill

•Created by Elber Domingos on 10/23/2024 in #help

Weird script python problem

that is cool. Thank you - I will use it for sure

12 replies

WWindmill

•Created by Elber Domingos on 10/23/2024 in #help

Weird script python problem

Code for transcript with proxy

12 replies

WWindmill

•Created by Elber Domingos on 10/23/2024 in #help

Weird script python problem

I am using Webshare to get a proxy, and the code is below. It worked, but sometimes it gets an IP that is blocked and stops working. You could get a residential proxy - more expensive - but it would probably always work.

12 replies

WWindmill

•Created by Elber Domingos on 10/23/2024 in #help

Weird script python problem

Uhm. I didn’t think about it. I have a proxy server. I will try it and write back

12 replies

WWindmill

•Created by Elber Domingos on 10/23/2024 in #help

Weird script python problem

Rest of the code:

# Main function for Windmill.dev
def main(
    url: str, with_timestamps: bool = False, languages: List[str] = ["en", "pt"]
) -> Union[str, Dict]:
    """Entry point: fetch the transcript of a YouTube video.

    Args:
        url: URL of the YouTube video.
        with_timestamps: Accepted for interface compatibility; it currently
            has no effect on the returned value.
        languages: Requested language codes. Anything other than ``"en"``
            or ``"pt"`` is dropped before the lookup.

    Returns:
        Whatever ``get_youtube_transcript`` returns for the filtered
        languages: the transcript text, or a dict describing the failure.
    """
    supported_codes = ("en", "pt")
    # Rebinding (not mutating) keeps the shared default list untouched.
    languages = [code for code in languages if code in supported_codes]

    # Error dicts and successful transcripts are both handed back unchanged.
    return get_youtube_transcript(url, languages)

# Main function for Windmill.dev
def main(
    url: str, with_timestamps: bool = False, languages: List[str] = ["en", "pt"]
) -> Union[str, Dict]:
    """Entry point: fetch the transcript of a YouTube video.

    Args:
        url: URL of the YouTube video.
        with_timestamps: Accepted for interface compatibility; it currently
            has no effect on the returned value.
        languages: Requested language codes. Anything other than ``"en"``
            or ``"pt"`` is dropped before the lookup.

    Returns:
        Whatever ``get_youtube_transcript`` returns for the filtered
        languages: the transcript text, or a dict describing the failure.
    """
    supported_codes = ("en", "pt")
    # Rebinding (not mutating) keeps the shared default list untouched.
    languages = [code for code in languages if code in supported_codes]

    # Error dicts and successful transcripts are both handed back unchanged.
    return get_youtube_transcript(url, languages)

12 replies

WWindmill

•Created by Elber Domingos on 10/23/2024 in #help

Weird script python problem

Python code:

from youtube_transcript_api import YouTubeTranscriptApi
import re
from typing import List, Dict, Union


# Function to extract video ID from YouTube URL
def get_video_id(url: str) -> Union[str, None]:
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes:

    * ``youtube.com/watch?...v=ID`` (``v`` may appear anywhere in the query)
    * ``youtu.be/ID``
    * ``youtube.com/embed/ID``, ``/v/ID``, ``/shorts/ID``, ``/live/ID``
      (also on ``youtube-nocookie.com``)

    Args:
        url: The URL to inspect.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or matches none of
        the recognised shapes.
    """
    if not url:
        return None

    # The ID ends at the first query/fragment/path delimiter or whitespace so
    # trailing parts such as "&t=10s", "?si=...", or "#t=30" never leak into it.
    id_group = r"([^?&#/\s]+)"
    patterns = [
        # Lazy optional prefix: take "v=" right after "?" when present,
        # otherwise skip over earlier query parameters to reach "&v=".
        r"youtube\.com/watch\?(?:[^#\s]*?&)??v=" + id_group,
        r"youtu\.be/" + id_group,
        r"youtube(?:-nocookie)?\.com/(?:embed|v|shorts|live)/" + id_group,
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


# Function to fetch YouTube transcript in specified languages
def get_youtube_transcript(url: str, languages: List[str]) -> Union[str, Dict]:
    """Fetch a video's transcript and flatten it into one string.

    Args:
        url: YouTube URL; its video ID is resolved with ``get_video_id``.
        languages: Language codes forwarded to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` field of every transcript entry joined by single
        spaces. On failure a dict is returned instead: ``{"error": ...}``
        for an unrecognised URL, or ``{"error": ..., "message": ...}`` when
        the transcript lookup raises.
    """
    video_id = get_video_id(url)
    if not video_id:
        return {"error": "Invalid YouTube URL"}

    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    except Exception as exc:
        # Any lookup failure is reported to the caller as data, not raised.
        explanation = (
            f"Could not retrieve a transcript for the video ID {video_id}. "
            f"Subtitles might be disabled or not available in the specified languages."
        )
        return {"error": str(exc), "message": explanation}
    else:
        return " ".join(entry["text"] for entry in entries)

from youtube_transcript_api import YouTubeTranscriptApi
import re
from typing import List, Dict, Union


# Function to extract video ID from YouTube URL
def get_video_id(url: str) -> Union[str, None]:
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes:

    * ``youtube.com/watch?...v=ID`` (``v`` may appear anywhere in the query)
    * ``youtu.be/ID``
    * ``youtube.com/embed/ID``, ``/v/ID``, ``/shorts/ID``, ``/live/ID``
      (also on ``youtube-nocookie.com``)

    Args:
        url: The URL to inspect.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or matches none of
        the recognised shapes.
    """
    if not url:
        return None

    # The ID ends at the first query/fragment/path delimiter or whitespace so
    # trailing parts such as "&t=10s", "?si=...", or "#t=30" never leak into it.
    id_group = r"([^?&#/\s]+)"
    patterns = [
        # Lazy optional prefix: take "v=" right after "?" when present,
        # otherwise skip over earlier query parameters to reach "&v=".
        r"youtube\.com/watch\?(?:[^#\s]*?&)??v=" + id_group,
        r"youtu\.be/" + id_group,
        r"youtube(?:-nocookie)?\.com/(?:embed|v|shorts|live)/" + id_group,
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


# Function to fetch YouTube transcript in specified languages
def get_youtube_transcript(url: str, languages: List[str]) -> Union[str, Dict]:
    """Fetch a video's transcript and flatten it into one string.

    Args:
        url: YouTube URL; its video ID is resolved with ``get_video_id``.
        languages: Language codes forwarded to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` field of every transcript entry joined by single
        spaces. On failure a dict is returned instead: ``{"error": ...}``
        for an unrecognised URL, or ``{"error": ..., "message": ...}`` when
        the transcript lookup raises.
    """
    video_id = get_video_id(url)
    if not video_id:
        return {"error": "Invalid YouTube URL"}

    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    except Exception as exc:
        # Any lookup failure is reported to the caller as data, not raised.
        explanation = (
            f"Could not retrieve a transcript for the video ID {video_id}. "
            f"Subtitles might be disabled or not available in the specified languages."
        )
        return {"error": str(exc), "message": explanation}
    else:
        return " ".join(entry["text"] for entry in entries)

12 replies