|
--- |
|
license: apache-2.0 |
|
datasets: |
|
- Yehor/cv10-uk-testset-clean-punctuated |
|
language: |
|
- uk |
|
base_model: |
|
- openai/whisper-large-v3-turbo |
|
--- |
|
|
|
# Quantized Whisper Large V3 Turbo with calibration on Ukrainian |
|
|
|
This model was quantized with [llmcompressor](https://pypi.org/project/llmcompressor/).
|
|
|
Data used for calibration: https://huggingface.co/datasets/Yehor/cv10-uk-testset-clean-punctuated |
|
|
|
Quantization notebook (how to reproduce): https://colab.research.google.com/drive/1TsCMxwq9kqsWV8jabihFN7J78RKgyvnD?usp=sharing
|
|
|
## Usage |
|
|
|
Install required packages: |
|
|
|
``` |
|
pip install vllm polars |
|
``` |
|
|
|
Run inference: |
|
|
|
```python |
|
import io |
|
import wave |
|
|
|
import numpy as np |
|
import polars as pl |
|
|
|
from vllm import LLM, SamplingParams |
|
|
|
|
|
def bytes_to_numpy(_bytes, expected_rate=None):
    """Decode mono 16-bit PCM WAV bytes into a float32 waveform in [-1.0, 1.0).

    Args:
        _bytes: Raw bytes of a WAV (RIFF) file.
        expected_rate: If given, also validate the file's sample rate and raise
            on mismatch. Defaults to None (no check) for backward compatibility;
            pass 16_000 to match the rate hard-coded in the inference example.

    Returns:
        A 1-D ``np.ndarray`` of dtype ``float32``, normalized by the int16 range.

    Raises:
        ValueError: If the audio is not mono, not 16-bit, or (when
            ``expected_rate`` is set) has a different sample rate.
    """
    with wave.open(io.BytesIO(_bytes), "rb") as wr:
        if (nc := wr.getnchannels()) != 1:
            raise ValueError(f"num_channels must be 1, got {nc}")
        if (sw := wr.getsampwidth()) != 2:
            raise ValueError(f"sample_width must be 2, got {sw}")
        if expected_rate is not None and (fr := wr.getframerate()) != expected_rate:
            raise ValueError(f"frame_rate must be {expected_rate}, got {fr}")

        audio_data = wr.readframes(wr.getnframes())

    # int16 -> float32; 32768 is the int16 magnitude, so values land in [-1.0, 1.0).
    return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
|
|
|
|
|
# Load the quantized Whisper model; one audio clip per prompt.
llm = LLM(
    model="Yehor/whisper-large-v3-turbo-quantized-uk",
    max_model_len=448,
    max_num_seqs=400,
    gpu_memory_utilization=0.8,
    limit_mm_per_prompt={"audio": 1},
)

# NOTE(review): this reads the non-punctuated "clean" dataset, while the model
# card lists the punctuated variant for calibration — confirm which is intended.
df = pl.read_parquet("hf://datasets/Yehor/cv10-uk-testset-clean/data/train-*.parquet")

# Sampling parameters are loop-invariant — build them once, not per row.
# NOTE(review): temperature=1.0 samples randomly; for deterministic greedy
# transcription, 0.0 is the usual choice — confirm the intent.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=200,
)

for row in df.iter_rows(named=True):
    inputs = {
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                # (waveform, sample_rate) — assumes 16 kHz mono 16-bit WAV input.
                "audio": (bytes_to_numpy(row["audio"]["bytes"]), 16_000,),
            },
        },
        # Force Ukrainian transcription without timestamp tokens.
        "decoder_prompt": "<|startoftranscript|><|uk|><|transcribe|><|notimestamps|>",
    }

    outputs = llm.generate(inputs, sampling_params)

    print(f"PROMPT : {outputs[0].prompt}")
    print(f"TRANSCRIPTION: {row['transcription']}")
    print(f"PREDICTION: {outputs[0].outputs[0].text}")
    print("==========================================")
|
``` |
|
|