# Licensing and download-tracking settings for this card.
license: cc-by-4.0
track_downloads: true

# Language codes declared for the model (order kept as authored).
language:
  - en
  - es
  - fr
  - de
  - bg
  - hr
  - cs
  - da
  - nl
  - et
  - fi
  - el
  - hu
  - it
  - lv
  - lt
  - mt
  - pl
  - pt
  - ro
  - sk
  - sl
  - sv
  - ru
  - uk

# Task and library identifiers.
pipeline_tag: automatic-speech-recognition
library_name: nemo

# Datasets referenced by this card.
datasets:
  - nvidia/Granary
  - nemo/asr-set-3.0

# No thumbnail is set.
thumbnail: null

# Free-form tags attached to the card.
tags:
  - automatic-speech-recognition
  - speech
  - audio
  - Transducer
  - TDT
  - FastConformer
  - Conformer
  - pytorch
  - NeMo
  - hf-asr-leaderboard
# Example audio clips: each list item pairs a display title with the URL of
# the audio file. `src` must be nested inside its item; at column 0 it would
# be read as a separate top-level key and break the list.
widget:
  - example_title: Librispeech sample 1
    src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
  - example_title: Librispeech sample 2
    src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
# Evaluation results for parakeet-tdt-0.6b-v3.
#
# Each entry under `results` has three parts:
#   task    - `type` is the machine-readable task id, `name` the display label
#   dataset - dataset id (`type`), display `name`, optional `config`, `split`,
#             and `args.language`
#   metrics - a list with one WER entry (`type: wer`); `value` is presumably
#             a percentage - confirm against the evaluation report
model-index:
  - name: parakeet-tdt-0.6b-v3
    results:
      # ---- English benchmarks ----
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: AMI (Meetings test)
          type: edinburghcstr/ami
          config: ihm
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 11.31
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Earnings-22
          type: revdotcom/earnings22
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 11.42
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: GigaSpeech
          type: speechcolab/gigaspeech
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 9.59
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: LibriSpeech (clean)
          type: librispeech_asr
          # Was `other`, which duplicated the "LibriSpeech (other)" entry.
          config: clean
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 1.93
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: LibriSpeech (other)
          type: librispeech_asr
          config: other
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 3.59
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: SPGI Speech
          type: kensho/spgispeech
          config: test
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 3.97
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          # NOTE(review): name says v3 but config is release1 - confirm.
          name: tedlium-v3
          type: LIUM/tedlium
          config: release1
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 2.75
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Vox Populi
          type: facebook/voxpopuli
          config: en
          split: test
          args:
            language: en
        metrics:
          - name: Test WER
            type: wer
            value: 6.14

      # ---- FLEURS, one entry per language config ----
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: bg_bg
          split: test
          args:
            language: bg
        metrics:
          - name: Test WER (Bg)
            type: wer
            value: 12.64
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: cs_cz
          split: test
          args:
            language: cs
        metrics:
          - name: Test WER (Cs)
            type: wer
            value: 11.01
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: da_dk
          split: test
          args:
            language: da
        metrics:
          - name: Test WER (Da)
            type: wer
            value: 18.41
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: de_de
          split: test
          args:
            language: de
        metrics:
          - name: Test WER (De)
            type: wer
            value: 5.04
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: el_gr
          split: test
          args:
            language: el
        metrics:
          - name: Test WER (El)
            type: wer
            value: 20.7
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: en_us
          split: test
          args:
            language: en
        metrics:
          - name: Test WER (En)
            type: wer
            value: 4.85
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: es_419
          split: test
          args:
            language: es
        metrics:
          - name: Test WER (Es)
            type: wer
            value: 3.45
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: et_ee
          split: test
          args:
            language: et
        metrics:
          - name: Test WER (Et)
            type: wer
            value: 17.73
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: fi_fi
          split: test
          args:
            language: fi
        metrics:
          - name: Test WER (Fi)
            type: wer
            value: 13.21
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: fr_fr
          split: test
          args:
            language: fr
        metrics:
          - name: Test WER (Fr)
            type: wer
            value: 5.15
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: hr_hr
          split: test
          args:
            language: hr
        metrics:
          - name: Test WER (Hr)
            type: wer
            value: 12.46
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: hu_hu
          split: test
          args:
            language: hu
        metrics:
          - name: Test WER (Hu)
            type: wer
            value: 15.72
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: it_it
          split: test
          args:
            language: it
        metrics:
          - name: Test WER (It)
            type: wer
            value: 3
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: lt_lt
          split: test
          args:
            language: lt
        metrics:
          - name: Test WER (Lt)
            type: wer
            value: 20.35
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: lv_lv
          split: test
          args:
            language: lv
        metrics:
          - name: Test WER (Lv)
            type: wer
            value: 22.84
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: mt_mt
          split: test
          args:
            language: mt
        metrics:
          - name: Test WER (Mt)
            type: wer
            value: 20.46
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: nl_nl
          split: test
          args:
            language: nl
        metrics:
          - name: Test WER (Nl)
            type: wer
            value: 7.48
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: pl_pl
          split: test
          args:
            language: pl
        metrics:
          - name: Test WER (Pl)
            type: wer
            value: 7.31
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: pt_br
          split: test
          args:
            language: pt
        metrics:
          - name: Test WER (Pt)
            type: wer
            value: 4.76
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: ro_ro
          split: test
          args:
            language: ro
        metrics:
          - name: Test WER (Ro)
            type: wer
            value: 12.44
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: ru_ru
          split: test
          args:
            language: ru
        metrics:
          - name: Test WER (Ru)
            type: wer
            value: 5.51
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: sk_sk
          split: test
          args:
            language: sk
        metrics:
          - name: Test WER (Sk)
            type: wer
            value: 8.82
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: sl_si
          split: test
          args:
            language: sl
        metrics:
          - name: Test WER (Sl)
            type: wer
            value: 24.03
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: sv_se
          split: test
          args:
            language: sv
        metrics:
          - name: Test WER (Sv)
            type: wer
            value: 15.08
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: FLEURS
          type: google/fleurs
          config: uk_ua
          split: test
          args:
            language: uk
        metrics:
          - name: Test WER (Uk)
            type: wer
            value: 6.79

      # ---- Multilingual LibriSpeech ----
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Multilingual LibriSpeech
          type: facebook/multilingual_librispeech
          config: spanish
          split: test
          args:
            language: es
        metrics:
          - name: Test WER (Es)
            type: wer
            value: 4.39
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Multilingual LibriSpeech
          type: facebook/multilingual_librispeech
          config: french
          split: test
          args:
            language: fr
        metrics:
          - name: Test WER (Fr)
            type: wer
            value: 4.97
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Multilingual LibriSpeech
          type: facebook/multilingual_librispeech
          config: italian
          split: test
          args:
            language: it
        metrics:
          - name: Test WER (It)
            type: wer
            value: 10.08
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Multilingual LibriSpeech
          type: facebook/multilingual_librispeech
          config: dutch
          split: test
          args:
            language: nl
        metrics:
          - name: Test WER (Nl)
            type: wer
            value: 12.78
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Multilingual LibriSpeech
          type: facebook/multilingual_librispeech
          config: polish
          split: test
          args:
            language: pl
        metrics:
          - name: Test WER (Pl)
            type: wer
            value: 7.28
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: Multilingual LibriSpeech
          type: facebook/multilingual_librispeech
          config: portuguese
          split: test
          args:
            language: pt
        metrics:
          - name: Test WER (Pt)
            type: wer
            value: 7.5

      # ---- CoVoST2 ----
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: de
          split: test
          args:
            language: de
        metrics:
          - name: Test WER (De)
            type: wer
            value: 4.84
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: en
          split: test
          args:
            language: en
        metrics:
          - name: Test WER (En)
            type: wer
            value: 6.8
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: es
          split: test
          args:
            language: es
        metrics:
          - name: Test WER (Es)
            type: wer
            value: 3.41
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: et
          split: test
          args:
            language: et
        metrics:
          - name: Test WER (Et)
            type: wer
            value: 22.04
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: fr
          split: test
          args:
            language: fr
        metrics:
          - name: Test WER (Fr)
            type: wer
            value: 6.05
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: it
          split: test
          args:
            language: it
        metrics:
          - name: Test WER (It)
            type: wer
            value: 3.69
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: lv
          split: test
          args:
            language: lv
        metrics:
          - name: Test WER (Lv)
            type: wer
            value: 38.36
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: nl
          split: test
          args:
            language: nl
        metrics:
          - name: Test WER (Nl)
            type: wer
            value: 6.5
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: pt
          split: test
          args:
            language: pt
        metrics:
          - name: Test WER (Pt)
            type: wer
            value: 3.96
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: ru
          split: test
          args:
            language: ru
        metrics:
          - name: Test WER (Ru)
            type: wer
            value: 3
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: sl
          split: test
          args:
            language: sl
        metrics:
          - name: Test WER (Sl)
            type: wer
            value: 31.8
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: sv
          split: test
          args:
            language: sv
        metrics:
          - name: Test WER (Sv)
            type: wer
            value: 20.16
      - task:
          name: Automatic Speech Recognition
          type: automatic-speech-recognition
        dataset:
          name: CoVoST2
          type: covost2
          config: uk
          split: test
          args:
            language: uk
        metrics:
          - name: Test WER (Uk)
            type: wer
            value: 5.1
# Top-level list of metric ids for the card; only `wer` is declared.
metrics:
  - wer
🦜 parakeet-tdt-0.6b-v3: Multilingual Speech-to-Text Model
Description:
parakeet-tdt-0.6b-v3 is a 600-million-parameter multilingual automatic speech recognition (ASR) model designed for high-throughput speech-to-text transcription. It extends the parakeet-tdt-0.6b-v2 model by expanding language support from English to 25 European languages. The model automatically detects the language of the audio and transcribes it without requiring additional prompting. It is part of a series of models that leverage the Granary [1, 2] multilingual corpus as their primary training dataset.
🗣️ Try the demo here: https://huggingface.co/spaces/nvidia/parakeet-tdt-0.6b-v3
Supported Languages:
Bulgarian (bg), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hungarian (hu), Italian (it), Latvian (lv), Lithuanian (lt), Maltese (mt), Polish (pl), Portuguese (pt), Romanian (ro), Slovak (sk), Slovenian (sl), Spanish (es), Swedish (sv), Russian (ru), Ukrainian (uk)
This model is ready for commercial/non-commercial use.