pip install whisperplus
You can find the models on the Hugging Face Model Hub.
To use the whisperplus library, follow the steps below for different tasks:
# Speech-to-text example: obtain an audio file from a video URL, run it
# through a Whisper model, and print the result.
from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3
# Video whose audio will be transcribed.
url = "https://www.youtube.com/watch?v=di3rHkEZuUw"
# The return value is passed to the pipeline as its first argument; presumably
# a path to a local MP3 file (inferred from the function name — confirm).
audio_path = download_and_convert_to_mp3(url)
pipeline = SpeechToTextPipeline(model_id="openai/whisper-large-v3")
# The model id is supplied a second time at call time, together with what
# looks like the spoken-language name.
# NOTE(review): confirm whether the call-time model id must match the
# constructor's, or whether one of the two is redundant.
transcript = pipeline(audio_path, "openai/whisper-large-v3", "english")
print(transcript)
# Summarization example. `transcript` is not defined in this snippet; it must
# already exist (for example, the output of a speech-to-text step).
from whisperplus import TextSummarizationPipeline
summarizer = TextSummarizationPipeline(model_id="facebook/bart-large-cnn")
summary = summarizer.summarize(transcript)
# The result is indexed as a sequence of dicts; the text of the first entry
# is stored under the "summary_text" key.
print(summary[0]["summary_text"])
# Long-text summarization example. `transcript` is not defined in this
# snippet; it must already exist (for example, from a speech-to-text step).
from whisperplus import LongTextSummarizationPipeline
summarizer = LongTextSummarizationPipeline(model_id="facebook/bart-large-cnn")
# Unlike the TextSummarizationPipeline example, the result is printed
# directly rather than indexed with [0]["summary_text"].
summary_text = summarizer.summarize(transcript)
print(summary_text)
# Speaker-diarization example: transcribe an audio file with an ASR model
# combined with a diarizer model, then format the output as a dialogue.
from whisperplus import (
ASRDiarizationPipeline,
download_and_convert_to_mp3,
format_speech_to_dialogue,
)
audio_path = download_and_convert_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E")
# Device string forwarded to the pipeline; alternatives are "cpu" or "mps".
device = "cuda" # cpu or mps
pipeline = ASRDiarizationPipeline.from_pretrained(
asr_model="openai/whisper-large-v3",
diarizer_model="pyannote/speaker-diarization",
# NOTE(review): the pyannote diarization checkpoints may be gated on the
# Hugging Face Hub; if so, a real access token is needed here — confirm.
use_auth_token=False,
# Presumably the ASR chunk length in seconds (per the `_s` suffix) — confirm.
chunk_length_s=30,
device=device,
)
# NOTE(review): pyannote's own keywords are spelled `min_speakers` /
# `max_speakers`; verify that the singular forms used here are accepted.
output_text = pipeline(audio_path, num_speakers=2, min_speaker=1, max_speaker=2)
dialogue = format_speech_to_dialogue(output_text)
print(dialogue)
# Chat-with-video example: ask a question about a transcript file using a
# local GGUF language model and a sentence-transformers embedding model.
from whisperplus.pipelines.chatbot import ChatWithVideo
chat = ChatWithVideo(
# NOTE(review): "trascript.txt" looks like a misspelling of
# "transcript.txt"; the file must exist under whichever name is used.
input_file="trascript.txt",
llm_model_name="TheBloke/Mistral-7B-v0.1-GGUF",
llm_model_file="mistral-7b-v0.1.Q4_K_M.gguf",
llm_model_type="mistral",
embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
)
query = "what is this video about ?"
response = chat.run_query(query)
print(response)
# AutoLLM chat-with-video example: configure prompts, LLM, and embedding
# model, then ask a question about the input.
from whisperplus import AutoLLMChatWithVideo
# service_context_params
# Instruction text passed to the chat object as `system_prompt`.
system_prompt = """
You are an friendly ai assistant that help users find the most relevant and accurate answers
to their questions based on the documents you have access to.
When answering the questions, mostly rely on the info in documents.
"""
# Template passed as `query_wrapper_prompt`. It contains the placeholders
# {context_str} and {query_str}; presumably these are filled in by the
# library at query time — confirm.
query_wrapper_prompt = """
The document information is below.
---------------------
{context_str}
---------------------
Using the document information and mostly relying on it,
answer the query.
Query: {query_str}
Answer:
"""
chat = AutoLLMChatWithVideo(
# NOTE(review): the value reads like a directory name while the comment says
# "mp3 file" — confirm which one the library expects.
input_file="input_dir", # path of mp3 file
openai_key="YOUR_OPENAI_KEY", # optional
huggingface_key="YOUR_HUGGINGFACE_KEY", # optional
llm_model="gpt-3.5-turbo",
# NOTE(review): both numeric settings below are passed as strings; confirm
# the library accepts or converts them.
llm_max_tokens="256",
llm_temperature="0.1",
system_prompt=system_prompt,
query_wrapper_prompt=query_wrapper_prompt,
embed_model="huggingface/BAAI/bge-large-zh", # "text-embedding-ada-002"
)
query = "what is this video about ?"
response = chat.run_query(query)
print(response)
# Text-to-speech example: synthesize "Hello World" with the suno/bark model
# using a named voice preset.
from whisperplus import TextToSpeechPipeline
tts = TextToSpeechPipeline(model_id="suno/bark")
# The result is kept in `audio`; this snippet neither saves nor plays it.
audio = tts(text="Hello World", voice_preset="v2/en_speaker_6")
# Auto-caption example: run a Whisper model over an input video and name an
# output video path.
from whisperplus import WhisperAutoCaptionPipeline
caption = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3")
# `language` is presumably the language spoken in the video — confirm.
caption(video_path="test.mp4", output_path="output.mp4", language="turkish")
pip install -r dev-requirements.txt
pre-commit install
pre-commit run --all-files
This project is licensed under the terms of the Apache License 2.0.
@misc{radford2022whisper,
doi = {10.48550/ARXIV.2212.04356},
url = {https://arxiv.org/abs/2212.04356},
author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
title = {Robust Speech Recognition via Large-Scale Weak Supervision},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}