Malaysian-Dataset, We gather Malaysian corpus!
This repository stores the corpus for huseinzol05/Malaya.
Speech dataset moved to huseinzol05/malaya-speech/data.
We will keep updating this repository over time.
How we gather dataset?
Social media
- We capture most live data from Twitter, Facebook and Instagram using crawlers, so we simply search it using Elasticsearch queries.
Translation
- We use Google Translate.
- We use ChatGPT.
- We use Malaya translation.
Data tagged using this is generated from translation.
Semisupervised
Teacher-student
- Supervised small samples and then trained a base model.
- Use the trained base model to predict larger samples, then retrain the next student models on the high-confidence labelled data.
- Repeat.
LLM
- Generate using ChatGPT.
Data tagged using this is generated from LLM.
Projects
To gather at least 100B tokens of Malaysian texts.
Gather PT3 and SPM level benchmark for LLM. Minimum 50 questions for each benchmark dataset.
To gather multispeaker voices for TTS task.
To gather mixed speech semisupervised using Large model STT.
Notes
- Any missing mp.py, get it at https://gist.github.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a
- Any missing python scripts, please contact me ASAP or create an issue.
- Please at least email us first before distributing these data. Remember that we put a lot of hard work into this and want to give it away for free.
- What you see is just the data, but nobody can see how much it cost us to make it public.
Suggestion
- Feel free to contact me to request new dataset.
- Feel free to open an issue if the link to a dataset is forbidden; sometimes I forget to make it open to the public.
Non-commercial Usage
A lot of the data here is semisupervised / translated / tagged / decoded using third-party software, for example Google Translate and Google Speech. To avoid any future complications, it is better not to use this data for commercial purposes; use for certain research purposes is allowed.
Acknowledgement
Thanks to Im Big, LigBlou, Mesolitica and KeyReply for sponsoring AWS, Google and private cloud to deploy distributed crawlers.
Chatbot
Alpaca
Total size: 44 MB
@misc{alpaca,
  author       = {Rohan Taori and
                  Ishaan Gulrajani and
                  Tianyi Zhang and
                  Yann Dubois and
                  Xuechen Li and
                  Carlos Guestrin and
                  Percy Liang and
                  Tatsunori B. Hashimoto},
  title        = {Stanford Alpaca: An Instruction-following LLaMA model},
  year         = {2023},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/tatsu-lab/stanford_alpaca}}
}
Code context
Total size: 61.3 MB
Code Instructions
Total size: 91.9 MB
GPT4ALL
Total size: 1020 MB
@misc{gpt4all,
  author       = {Yuvanesh Anand and
                  Zach Nussbaum and
                  Brandon Duderstadt and
                  Benjamin Schmidt and
                  Andriy Mulyar},
  title        = {GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo},
  year         = {2023},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/nomic-ai/gpt4all}}
}
Dolly15k
Total size: 25.6 MB
@misc{dolly,
  author       = {databrickslabs},
  title        = {Dolly},
  year         = {2023},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/databrickslabs/dolly}}
}
GPT4ALL
Total size: 1352 MB
@misc{gpt4all,
  author       = {Yuvanesh Anand and
                  Zach Nussbaum and
                  Brandon Duderstadt and
                  Benjamin Schmidt and
                  Andriy Mulyar},
  title        = {GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo},
  year         = {2023},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/nomic-ai/gpt4all}}
}
GPT4ALL-v1.3
Total size: 1520 MB
@misc{gpt4all,
  author       = {Yuvanesh Anand and
                  Zach Nussbaum and
                  Brandon Duderstadt and
                  Benjamin Schmidt and
                  Andriy Mulyar},
  title        = {GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo},
  year         = {2023},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/nomic-ai/gpt4all}}
}
Lamini-LM
Total size: 2710 MB
@article{lamini-lm,
  author     = {Minghao Wu and
                Abdul Waheed and
                Chiyu Zhang and
                Muhammad Abdul-Mageed and
                Alham Fikri Aji},
  title      = {LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions},
  journal    = {CoRR},
  volume     = {abs/2304.14402},
  year       = {2023},
  url        = {https://arxiv.org/abs/2304.14402},
  eprinttype = {arXiv},
  eprint     = {2304.14402}
}
NSText2SQL
Total size: 532 MB
@software{numbersstation2023NSText2SQL,
  author = {{Numbers Station Labs}},
  title  = {{NSText2SQL}: An Open Source {Text-to-SQL} Dataset for Foundation Model Training},
  month  = jul,
  year   = {2023},
  url    = {https://github.com/NumbersStationAI/NSQL}
}
NSText2SQL
Total size: 532 MB
@software{numbersstation2023NSText2SQL,
  author = {{Numbers Station Labs}},
  title  = {{NSText2SQL}: An Open Source {Text-to-SQL} Dataset for Foundation Model Training},
  month  = jul,
  year   = {2023},
  url    = {https://github.com/NumbersStationAI/NSQL}
}
oasst1
Total size: 65.4 MB
@misc{kopf2023openassistant,
  title         = {{OpenAssistant} Conversations -- Democratizing Large Language Model Alignment},
  author        = {Andreas K{\"o}pf and
                   Yannic Kilcher and
                   Dimitri von R{\"u}tte and
                   Sotiris Anagnostidis and
                   Zhi-Rui Tam and
                   Keith Stevens and
                   Abdullah Barhoum and
                   Nguyen Minh Duc and
                   Oliver Stanley and
                   Rich{\'a}rd Nagyfi and
                   Shahul ES and
                   Sameer Suri and
                   David Glushkov and
                   Arnav Dantuluri and
                   Andrew Maguire and
                   Christoph Schuhmann and
                   Huu Nguyen and
                   Alexander Mattick},
  year          = {2023},
  eprint        = {2304.07327},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}
OIG
Total size: 1264 MB
@misc{laion2023oig,
  author       = {{LAION}},
  title        = {The {OIG} Dataset},
  year         = {2023},
  howpublished = {\url{https://laion.ai/blog/oig-dataset/}},
  note         = {Open Instruction Generalist (OIG) dataset}
}
OpenOrca
Total size: 1.5 GB
@misc{mukherjee2023orca,
  title         = {Orca: Progressive Learning from Complex Explanation Traces of GPT-4},
  author        = {Subhabrata Mukherjee and
                   Arindam Mitra and
                   Ganesh Jawahar and
                   Sahaj Agarwal and
                   Hamid Palangi and
                   Ahmed Awadallah},
  year          = {2023},
  eprint        = {2306.02707},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}
Blended Skill Talk
Total size: 31.2 MB
@article{DBLP:journals/corr/abs-2004-08449,
  author        = {Eric Michael Smith and
                   Mary Williamson and
                   Kurt Shuster and
                   Jason Weston and
                   Y{-}Lan Boureau},
  title         = {Can You Put it All Together: Evaluating Conversational Agents' Ability to Blend Skills},
  journal       = {CoRR},
  volume        = {abs/2004.08449},
  year          = {2020},
  url           = {https://arxiv.org/abs/2004.08449},
  archivePrefix = {arXiv},
  eprint        = {2004.08449},
  timestamp     = {Sat, 23 Jan 2021 01:20:50 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2004-08449.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
ConvAI2
Total size: 127.9 MB
@article{DBLP:journals/corr/abs-1902-00098,
  author        = {Emily Dinan and
                   Varvara Logacheva and
                   Valentin Malykh and
                   Alexander H. Miller and
                   Kurt Shuster and
                   Jack Urbanek and
                   Douwe Kiela and
                   Arthur Szlam and
                   Iulian Serban and
                   Ryan Lowe and
                   Shrimai Prabhumoye and
                   Alan W. Black and
                   Alexander I. Rudnicky and
                   Jason Williams and
                   Joelle Pineau and
                   Mikhail S. Burtsev and
                   Jason Weston},
  title         = {The Second Conversational Intelligence Challenge (ConvAI2)},
  journal       = {CoRR},
  volume        = {abs/1902.00098},
  year          = {2019},
  url           = {http://arxiv.org/abs/1902.00098},
  archivePrefix = {arXiv},
  eprint        = {1902.00098},
  timestamp     = {Sat, 23 Jan 2021 01:11:58 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1902-00098.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Wiki Wizard
Total size: 275.0 MB
@article{DBLP:journals/corr/abs-1811-01241,
  author        = {Emily Dinan and
                   Stephen Roller and
                   Kurt Shuster and
                   Angela Fan and
                   Michael Auli and
                   Jason Weston},
  title         = {Wizard of Wikipedia: Knowledge-Powered Conversational agents},
  journal       = {CoRR},
  volume        = {abs/1811.01241},
  year          = {2018},
  url           = {http://arxiv.org/abs/1811.01241},
  archivePrefix = {arXiv},
  eprint        = {1811.01241},
  timestamp     = {Sat, 23 Jan 2021 01:19:39 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1811-01241.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
DialoGPT
Total size: 5.6 GB
@article{DBLP:journals/corr/abs-1911-00536,
  author        = {Yizhe Zhang and
                   Siqi Sun and
                   Michel Galley and
                   Yen{-}Chun Chen and
                   Chris Brockett and
                   Xiang Gao and
                   Jianfeng Gao and
                   Jingjing Liu and
                   Bill Dolan},
  title         = {DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation},
  journal       = {CoRR},
  volume        = {abs/1911.00536},
  year          = {2019},
  url           = {http://arxiv.org/abs/1911.00536},
  archivePrefix = {arXiv},
  eprint        = {1911.00536},
  timestamp     = {Tue, 05 Jan 2021 15:06:52 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1911-00536.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Taskmaster
Total size: 94 MB
@inproceedings{48484,
  title     = {{Taskmaster-1}: Toward a Realistic and Diverse Dialog Dataset},
  author    = {Bill Byrne and
               Karthik Krishnamoorthi and
               Chinnadhurai Sankar and
               Arvind Neelakantan and
               Daniel Duckworth and
               Semih Yavuz and
               Ben Goodrich and
               Amit Dubey and
               Kyu-Young Kim and
               Andy Cedilnik},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  year      = {2019}
}
Corpus
Audience Nationality
Total size: 246 KB
- constituency
- national
@misc{figureeight2016political,
  author       = {{Figure Eight}},
  title        = {Political Social Media Posts},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/crowdflower/political-social-media-posts},
  year         = {2016},
  month        = nov
}
Twitter Emotion
Total size: 27.4 MB
- Anger, 108813 rows
- Fear, 20316 rows
- Happy, 30962 rows
- love, 20783 rows
- Sadness, 26468 rows
- Surprise, 13107 rows
@misc{malay-dataset-emotion,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/emotion}},
  note         = {We gather Bahasa Malaysia corpus! Semi-Supervised Emotion dataset}
}
Gender
Total size: 2.2 MB
- Unknown
- Male
- Female
- Brand
@misc{figureeight2016gender,
  author       = {{Figure Eight}},
  title        = {Twitter User Gender Classification},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/crowdflower/twitter-user-gender-classification},
  year         = {2016},
  month        = nov
}
Reference: https://www.kaggle.com/crowdflower/twitter-user-gender-classification
Insincere question
Total size: 60.4 MB
- Negative
- Positive
@misc{kaggle-quora-insincere,
  title        = {Quora Insincere Questions Classification},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/c/quora-insincere-questions-classification}
}
Irony
Total size: 465 KB
- Positive
- Negative
@misc{tatman_2017,
  author       = {Tatman, Rachael},
  title        = {Ironic Corpus},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/rtatman/ironic-corpus},
  year         = {2017},
  month        = jul
}
Language-detection
- english
- malay
- indonesia
- rojak
- manglish
- others
sublanguages,
- malay
- kedah
- johor
- melaka
- terengganu
- sarawak
- negeri-sembilan
- kelantan
- pahang
- perak
- sabah
@misc{malay-dataset-language-detection,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/language-detection}},
  note         = {We gather Bahasa Malaysia corpus! Lexicon based Language Detection dataset}
}
Malaysia-entities
Social media texts related to Malaysia entities.
Total size: 190.1 MB
@misc{malay-dataset-malaysia-entities,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/malaysia-entities}},
  note         = {We gather Bahasa Malaysia corpus! Lexicon based Malaysia Entities dataset}
}
Complete list (210 entities)
- mahathir
- anwar ibrahim
- najib razak
- pakatan harapan
- syed saddiq
- parti keadilan rakyat
- umno
- barisan nasional
- parti islam semalaysia
- nurul izzah
- tunku ismail idris
- mca
- democratic action party
- parti amanah
- ppbm
- mic
- tun daim zainuddin
- datuk seri abdul hadi awang
- majlis pakatan harapan
- wan azizah
- parti pribumi bersatu malaysia
- datuk seri azmin ali
- datuk johari abdul
- tengku razaleigh hamzah
- tan sri dr rais yatim
- rafizi ramli
- bersatu
- bernama
- donald trump
- perkasa
- tan sri mokhzani mahathir
- rais yatim
- anthony loke siew fook
- rosmah mansur
- arul kanda
- zeti aziz
- robert kuok
- hassan merican
- ks jomo
- jho low
- kadir jasin
- zakir naik
- bung mokhtar
- shafie apdal
- ariff md yusof
- felda
- dato vida
- jabatan perancangan bandar desa
- jabatan perdana menteri malaysia
- kementerian kewangan malaysia
- kementerian dalam negeri malaysia
- kementerian perdagangan dalam negeri hal ehwal pengguna malaysia
- kementerian luar negeri malaysia
- kementerian pertahanan malaysia
- kementerian pendidikan malaysia
- kementerian pembangunan luar bandar
- kementerian kerja raya malaysia
- kementerian kesihatan malaysia
- kementerian komunikasi multimedia malaysia
- kementerian perumahan kerajaan tempatan malaysia
- kementerian pelancongan kebudayaan malaysia
- kementerian pengangkutan malaysia
- kementerian pembangunan wanita keluarga masyarakat malaysia
- kementerian pertanian industri asas tani
- kementerian perusahaan perladangan komoditi
- kementerian perdagangan antarabangsa industri
- kementerian sains teknologi inovasi malaysia
- kementerian sumber manusia malaysia
- kementerian sumber asli alam sekitar malaysia
- kementerian wilayah persekutuan malaysia
- kementerian tenaga teknologi hijau air malaysia
- jabatan perkhidmatan awam malaysia
- jabatan kemajuan islam (jakim) department of islamic development
- jabatan parlimen malaysia
- agensi kelayakan malaysia
- agensi penguatkuasaan maritim malaysia
- bahagian istiadat urusetia persidangan antarabangsa
- bahagian hal ehwal undang-undang
- bahagian kabinet perlembangan perhubungan antara kerajaan
- bahagian kemajuan wilayah persekutuan perancangan lembah klang
- bahagian keselamatan negara
- bahagian pengurusan hartanah
- bahagian pengurusan perkhidmatan sumber manusia
- bahagian penyelidikan
- biro bantuan guaman
- biro pengaduan awam
- biro tatanegara
- istana negara
- institut kefahaman islam malaysia
- institut latihan kehakiman perundangan
- pejabat ketua setiausaha negara
- pejabat perdana menteri
- jabatan peguam negara
- majlis agama islam wilayah persekutuan
- masjid negara
- pejabat ketua pegawai keselamatan kerajaan malaysia
- pejabat setiausaha persekutuan sabah
- perpustakaan kuala lumpur
- pejabat setiausaha persekutuan sarawak
- lembaga tabung haji
- penasihat sains
- jabatan audit negara malaysia
- jabatan pertahanan awam malaysia
- suruhanjaya pengankutan awam darat
- perbendaharaan malaysia
- majlis tindakan ekonomik negara
- jabatan perangkaan (jp) department of statistics
- polis diraja malaysia
- ikatan relawan rakyat malaysia
- jabatan penjara malaysia
- jabatan pendaftaran negara malaysia
- lembaga penapisan filem
- jabatan imigresen malaysia
- suruhanjaya syarikat malaysia
- suruhanjaya koperasi malaysia
- perbadanan harta intelek malaysia
- bank kerjasama rakyat malaysia
- perbadanan nasional berhad
- maktab koperasi malaysia
- suruhanjaya persaingan malaysia
- institut diplomasi hal ehwal luar negeri
- angkatan tentera malaysia
- tentera darat malaysia
- tentera udara diraja malaysia
- tentera laut diraja malaysia
- program latihan khidmat negara
- dewan bahasa pustaka
- institut pendidikan guru malaysia
- perbadanan tabung pendidikan tinggi nasional
- institut terjemahan negara malaysia
- kejora
- felcra
- risda
- jabatan kerja raya malaysia
- lembaga lebuhraya malaysia
- lembaga jurutera malaysia
- lembaga pembangunan industri pembinaan
- institut jantung negara
- klinik 1malaysia
- insitut kanser negara
- radio televisyen malaysia
- suruhanjaya komunikasi multimedia malaysia
- jabatan penerangan malaysia
- jabatan perancangan bandar desa semenanjung malaysia
- jabatan bomba penyelamat malaysia
- jabatan perumahan negara
- jabatan kerajaan tempatan
- jabatan landskap negara
- jabatan pengurusan sisa pepejal negara
- tribunal perumahan pengurusan strata
- perbadanan pengurusan sisa pepejal pembersihan awam
- jabatan pelancongan malaysia
- jabatan pengangkutan jalan
- jabatan penerbangan awam
- lembaga pelabuhan klang
- jabatan laut malaysia
- jabatan keselamatan jalan raya
- lembaga pelabuhan kuantan
- lembaga pelabuhan johor
- lembaga pelabuhan pulau pinang
- jabatan kebajikan masyarakat malaysia
- institut penyelidikan kemajuan pertanian malaysia
- lembaga kemajuan ikan malaysia
- lembaga pemasaran pertanian persekutuan
- jabatan pertanian malaysia
- lembaga pertubuhan peladang
- lembaga kemajuan pertanian kemubu
- lembaga kemajuan pertanian muda
- jabatan perikanan
- jabatan perkhidmatan veterinar
- lembaga perindustrian nanas malaysia
- tabung ekonomi kumpulan usaha niaga
- bank pertanian
- lembaga minyak sawit malaysia
- lembaga pembangunan pelaburan malaysia
- agensi nuklear malaysia
- institut penyelidikan teknologi nuklear malaysia
- pusat sains negara
- jabatan kimia malaysia
- jabatan meteorologi malaysia
- jabatan perkhidmatan awam
- institut tadbiran awam negara
- jabatan agama islam wilayah persekutuan
- jabatan tenaga kerja semenanjung malaysia
- jabatan alam sekitar
- jabatan pengairan saliran
- jabatan tanah galian wilayah persekutuan
- jabatan perlindungan hidupan liar taman negara
- dewan bandaraya kuala lumpur
- perbadanan putrajaya
- perbadanan labuan
- jabatan bekalan air
- jabatan perkhidmatan pembetungan
- suruhanjaya tenaga
- suruhanjaya perkhidmatan air negara
- malaysian green technology corporation
- yayasan hijau malaysia
- mahkamah persekutuan
- mahkamah syariah wilayah persekutuan
- suruhanjaya perdagangan komoditi
- suruhanjaya perkhidmatan awam
- suruhanjaya perkhidmatan pendidikan
- suruhanjaya pilihan raya
- suruhanjaya pencegahan rasuah malaysia
- tribunal perkhidmatan awam
- unit khas teknologi tinggi
- unit pemodenan tadbiran perancangan pengurusan malaysia
- unit perancang ekonomi
- unit penyelarasan pelaksanaan
- urusetia persidangan antarabangsa protokol
Malaysia Topics
Social media texts related to Malaysia topics.
Total size: 322.4 MB
@misc{malay-dataset-malaysia-topics,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/malaysia-topics}},
  note         = {We gather Bahasa Malaysia corpus! Lexicon based Malaysia Topics dataset}
}
Complete list (249 topics)
- ganja
- orang asli
- kaum cina
- k-pop
- kaum india
- pos laju
- hari raya aidilfitri
- hari raya aidiladha
- syarikat permulaan
- isu tanah
- kaum melayu
- keluar parti
- sabotaj parti
- kotak undi
- humanoid
- kemalangan penumpang cedera
- kemalangan maut
- individu penjara
- kes rogol
- kes cabul
- kes rompakan
- kes ragut
- cambridge analytica
- kokain
- bebas tahanan
- sosial media
- mati dipukul
- pengedar dadah
- kematian wabak
- letupan bom
- isu dadah
- isu bmf
- isu diesel
- isu china
- isu saudi arabia
- unifi
- piala thomas
- fifa
- bahasa pengaturcaraan
- baling botol
- perkahwinan kanak-kanak
- produk berbahaya
- musim durian
- world cup
- motogp
- euro 2020
- ask me a question
- thai cave
- racist
- bola sepak
- hockey
- sepak takraw
- reformasi
- deepavali
- chinese new year
- lazada sells
- shopee sells
- e-sport
- valve corporation
- dota2
- counter strike global-offensive
- asean football organization
- blackpink
- kecurian kereta
- kecurian motosikal
- youtube rewind
- pewdiepie
- isu tiket
- kuota haji
- tsunami
- kes lemas
- kes buang bayi
- kes pecah rumah
- paedophilia
- kes luar nikah
- kes tangkap basah
- kes bawah umur
- pdrm
- 1mdb
- gst
- sst
- tiga penjuru
- pilihan raya umum
- pilihan raya kecil
- pusat daerah mangundi
- masalah air
- rumah mampu milik
- pendidikan
- sekolah
- universiti
- maktab rendah sains mara
- kesihatan
- hutang negara
- ekonomi
- sosial
- menteri besar kedah
- menteri besar perak
- menteri besar perlis
- menteri besar selangor
- menteri besar johor
- menteri besar kelantan
- menteri besar terengganu
- menteri besar negeri sembilan
- felda
- kwsp
- sosco
- bank malaysia
- bank negara
- perdana menteri
- timbalan perdana menteri
- menteri dalam negeri
- menteri kewangan
- menteri pertahanan
- menteri belia dan sukan
- majlis penasihat
- skim peduli sihat
- ptptn
- projek mega
- gaji minimum
- menyiasat skandal
- highway tol
- tabung haji
- tentera malaysia
- infrastruktur
- kos sara hidup
- pengangkutan awam
- perkhidmatan awam
- isu wanita
- survei institut darul ehsan
- inisiatif peduli rakyat
- teknologi
- internet
- kecerdasan buatan
- ahli dewan undangan negeri
- suruhanjaya pilihan raya malaysia
- kertas undi
- akta pilihan raya
- undi pos
- undi rosak
- harga minyak
- petrol
- subsidi kerajaan
- mh370
- gaji menteri
- jabatan bubar
- telekom malaysia
- agama
- lgbt
- agama islam
- masyarakat
- liberalisme
- kapitalisme
- idealogi
- parlimen
- pusat transformasi bandar
- institut diraja
- tsunami fitnah
- makro-ekonomi
- mikro-ekonomi
- pasaran saham malaysia
- pendapatan negara
- nilai ringgit jatuh
- gaji median
- bursa malaysia
- malaysia baru
- keluar parlimen
- dewan rakyat
- tabung harapan
- isu singapura
- isu rohingya
- isu syria
- malaysia-indonesia
- isu gaza
- isu palestin
- isu yaman
- harimau malaya
- isu kuil
- isu lynas
- isu masjid
- isu sosma
- isu ecrl
- royalti minyak
- kes rasuah
- kewangan dan perniagaan
- saham dan komoditi
- isu kerugian
- bumiputera
- alam sekitar
- isu kemiskinan
- sumber asli
- pertanian malaysia
- pertanian durian
- pertanian padi
- pertanian getah
- pertanian kelapa sawit
- pertanian pisang
- pertanian nenas
- akuakultur malaysia
- hortikultur malaysia
- icerd
- yang di-pertuan agong
- perlembagaan malaysia
- malaysia airlines
- malaysia airport
- kuala lumpur international airport
- malacca airport
- bintulu airport
- kota kinabalu airport
- kuching airport
- labuan airport
- lahad datu airport
- langkawi airport
- limbang airport
- miri airport
- penang airport
- sandakan airport
- sibu airport
- sultan abdul halim airport
- sultan haji ahmad shah airport
- sultan azlan shah airport
- sultan ismail petra airport
- sultan mahmud airport
- tawau airport
- tioman airport
- anggota bomba
- angkatan tentera darat
- angkatan tentera laut
- angkatan tentera udara
- anggota ambulans
- anggota polis
- perkhidmatan kehakiman
- perkhidmatan am persekutuan
- industri 4.0
- kumpulan pengganas tempatan
- kumpulan pengganas asing
- sultan selangor
- sultan kedah
- sultan kelantan
- sultan perlis
- sultan johor
- sultan negeri sembilan
- sultan terengganu
- pemilihan agong
- isu plastik
- gejala sosial
- isytihar darurat
Metadata Amazon reviews
Total size: 10.365 GB
Justifying recommendations using distantly-labeled reviews and fine-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019
Sarcastic news-headline
Total size: 1.78 MB
- Positive
- Negative
@misc{misra_2019,
  author       = {Misra, Rishabh},
  title        = {News Headlines Dataset For Sarcasm Detection},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection},
  year         = {2019},
  month        = jul
}
Subjectivity
Total size: 1.4 MB
- Positive
- Negative
@inproceedings{Pang+Lee:04a,
  author    = {Bo Pang and Lillian Lee},
  title     = {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle = {Proceedings of the ACL},
  year      = {2004}
}
Substring Language Detection
Total size: 542 MB
@misc{malay-dataset-substring-language-detection,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/substring-language-detection}},
  note         = {We gather Bahasa Malaysia corpus! Substring language detection}
}
Toxicity-small
Total size: 69 MB
Toxicity-small is multilabels and multiclasses, prefer to use sigmoid / logistic.
- toxic
- severe toxic
- obscene
- threat
- insult
- identity hate
@misc{kaggle-toxic-comment,
  title        = {Toxic Comment Classification Challenge},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge}
}
Toxicity-large
Total size: 640 MB
Toxicity-large is multilabels and multiclasses, prefer to use sigmoid / logistic.
- severe toxic
- obscene
- identity attack
- insult
- threat
- asian
- atheist
- bisexual
- black
- buddhist
- christian
- female
- heterosexual
- hindu
- homosexual, gay or lesbian
- intellectual or learning disability
- jewish
- latino
- male
- muslim
- other disability
- other gender
- other race or ethnicity
- other religion
- other sexual orientation
- physical disability
- psychiatric or mental illness
- transgender
- white
- malay
- chinese
@misc{kaggle-jigsaw-multilingual,
  title        = {Jigsaw Multilingual Toxic Comment Classification},
  howpublished = {Kaggle},
  url          = {https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification}
}
Added label 14, 29, 30, 31 by myself.
Political landscape
Total size: 2 MB
- Kerajaan (BN)
- Pembangkang (PAS, DAP, PKR)
@misc{malay-dataset-political-landscape,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/political-landscape}},
  note         = {We gather Bahasa Malaysia corpus! Lexicon based Political Landscape Detection dataset}
}
This polarity is based on 2018 political landscape.
NSFW
Total size: 85.9 MB
- Sex, 1383577 texts
- Gambling, 256168 texts
- negative, dumping/common-crawl
@misc{malay-dataset-nsfw,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/corpus/nsfw}},
  note         = {We gather Bahasa Malaysia corpus! Lexicon based NSFW Detection dataset}
}
The Pile
Total size: 22.7 GB
@article{DBLP:journals/corr/abs-2101-00027,
  author        = {Leo Gao and
                   Stella Biderman and
                   Sid Black and
                   Laurence Golding and
                   Travis Hoppe and
                   Charles Foster and
                   Jason Phang and
                   Horace He and
                   Anish Thite and
                   Noa Nabeshima and
                   Shawn Presser and
                   Connor Leahy},
  title         = {The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
  journal       = {CoRR},
  volume        = {abs/2101.00027},
  year          = {2021},
  url           = {https://arxiv.org/abs/2101.00027},
  archivePrefix = {arXiv},
  eprint        = {2101.00027},
  timestamp     = {Thu, 21 Jan 2021 14:42:30 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2101-00027.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
goemotions
Total size: 78.9 MB
@article{DBLP:journals/corr/abs-2005-00547,
  author     = {Dorottya Demszky and
                Dana Movshovitz{-}Attias and
                Jeongwoo Ko and
                Alan S. Cowen and
                Gaurav Nemade and
                Sujith Ravi},
  title      = {GoEmotions: {A} Dataset of Fine-Grained Emotions},
  journal    = {CoRR},
  volume     = {abs/2005.00547},
  year       = {2020},
  url        = {https://arxiv.org/abs/2005.00547},
  eprinttype = {arXiv},
  eprint     = {2005.00547},
  timestamp  = {Fri, 08 May 2020 15:04:04 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2005-00547.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
stopwords
List of stopwords in JSON. To get latest stopwords, get it at https://github.com/huseinzol05/malaya/blob/master/malaya/text/tatabahasa.py
Total size: 14 KB
Dictionary
Not an official release from Dewan Bahasa.
73k English-Malay
Total size: 1.1 MB
Reference: https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ms.txt
200k English-Malay
Total size: 6.9 MB
90k synonym
Total size: 4.7 MB
Dictionary, 24550 unique words
Total size: 428 KB
@misc{fakhrullah2016malaylanguage,
  author       = {Fakhrullah},
  title        = {MalayLanguage},
  year         = {2016},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/fakhrullah/MalayLanguage}},
  note         = {Malay language dictionary for Sublime Text}
}
Dialect
Glossaries for,
- johor
- kedah
- kelantan
- negeri sembilan
- melaka
- pahang
- penang
- sukuan
@misc{malay-dataset-dialect,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/dictionary/dialect}},
  note         = {We gather Bahasa Malaysia corpus! Dialect}
}
Ngrams
Total size: 92 MB
Unigram and Bigram collected from news, structure,
{'saya': 1000}
@misc{malay-dataset-ngram,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/dictionary/ngram}},
  note         = {We gather Bahasa Malaysia corpus! Ngram}
}
7k antonym
Total size: 200 KB
@misc{malay-dataset-antonym,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/dictionary/antonym}},
  note         = {We gather Bahasa Malaysia corpus! Antonym}
}
Cambridge English-Malaysian
Crawled from https://dictionary.cambridge.org/browse/english-malaysian/, 25171 english-malaysian words.
Total size: 20 MB
IPA
Mirror for https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/ma.txt, 28k samples
Total size: 600 KB
@misc{open-dict-data,
  author  = {Open-Dict-Data},
  title   = {Open-dict-data/IPA-dict: Monolingual wordlists with pronunciation information in IPA},
  journal = {GitHub},
  url     = {https://github.com/open-dict-data/ipa-dict}
}
Emoji
Translated https://unicode.org/Public/emoji/15.0/emoji-test.txt
Total size: 1 MB
Wiktionary
Filtered https://en.wiktionary.org/wiki/Wiktionary:Main_Page on bahasa words.
Total size: 34 MB
DBP
Crawled from https://prpm.dbp.gov.my/Cari1?keyword=
Total size: 25.7 MB
Document Ranking
MSMARCO
Total size: 1.5 GB
@article{DBLP:journals/corr/NguyenRSGTMD16,
  author        = {Tri Nguyen and
                   Mir Rosenberg and
                   Xia Song and
                   Jianfeng Gao and
                   Saurabh Tiwary and
                   Rangan Majumder and
                   Li Deng},
  title         = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},
  journal       = {CoRR},
  volume        = {abs/1611.09268},
  year          = {2016},
  url           = {http://arxiv.org/abs/1611.09268},
  archivePrefix = {arXiv},
  eprint        = {1611.09268},
  timestamp     = {Mon, 13 Aug 2018 16:49:03 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Dumping
CC-100
Total size: 6 GB
Common-crawl
List of mse
language websites only.
Total index size: 25.6 MB
Total website size: 9.6 GB
Total cleaned (removed NSFW) text extracted size: 2.93 GB
Clean
Gathered all dumping texts and applied cleaning and filtering.
Total size: 12.3 GB
Total size: 234 MB
IMDA
Extracted from IMDA dataset, https://www.imda.gov.sg/
Total size: 181 MB
Total size: 418.2 MB, 695571 sentences.
Karangan sekolah
Total size: 221 KB
NLLB
Total size: 2.49 GB
Gathered reddit posts and comments from malaysian and singaporean subreddits.
Total size: 149 MB
Singapore news
Total size: 213.1 MB, 1760382 sentences.
Contributed by https://github.com/brytjy
Singlish text
Singlish is a mix of Chinese, Bahasa, Tamil and majority English, singaporean slang.
Random crawled from different singaporean websites and blogs.
Total size: 1.2 GB, 19870766 sentences.
Contributed by https://github.com/brytjy
Subtitle
Total size: 1.5 MB
Total size: 55.9 GB
Wikipedia
Total size: 243.2 MB, 1748387 sentences.
Generative
CommonGen
Total size: 13.5 MB
@article{lin2019comgen,
author = {Bill Yuchen Lin and Wangchunshu Zhou and Ming Shen and Pei Zhou and Chandra Bhagavatula and Yejin Choi and Xiang Ren},
title = {CommonGen: A Constrained Text Generation Challenge for Generative Commonsense Reasoning},
journal = {Findings of EMNLP},
year = {2020}
}
Keyphrase
kdd
Total size: 3 MB
Originally from https://github.com/boudinfl/ake-datasets
WWW
Total size: 2.7 MB
Originally from https://github.com/boudinfl/ake-datasets
OpenKP
Total size: 1197 MB
@article{DBLP:journals/corr/NguyenRSGTMD16,
author = {Tri Nguyen and
Mir Rosenberg and
Xia Song and
Jianfeng Gao and
Saurabh Tiwary and
Rangan Majumder and
Li Deng},
title = {{MS} {MARCO:} {A} Human Generated MAchine Reading COmprehension Dataset},
journal = {CoRR},
volume = {abs/1611.09268},
year = {2016},
url = {http://arxiv.org/abs/1611.09268},
archivePrefix = {arXiv},
eprint = {1611.09268},
timestamp = {Mon, 13 Aug 2018 16:49:03 +0200},
biburl = {https://dblp.org/rec/journals/corr/NguyenRSGTMD16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
KPTimes
Total size: 4.3 GB
@inproceedings{gallina2019kptimes,
title={KPTimes: A Large-Scale Dataset for Keyphrase Generation on News Documents},
author={Gallina, Ygor and Boudin, Florian and Daille, B{\'e}atrice},
booktitle={Proceedings of the 12th International Conference on Natural Language Generation},
pages={130--135},
year={2019}
}
twitter bahasa
Total size: 1580 MB
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Extract Keywords from Twitter using Lexicon},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/keyphrase/twitter-bahasa}}
}
Xwikis
Total size: 2057 MB
Lexicon
Malaya provides a lexicon generator to induce new lexicons, see https://malaya.readthedocs.io/en/latest/Lexicon.html
sentiment
{'negative': ['str1','str2'], 'positive': ['str3','str4']}
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Unsupervised Sentiment Lexicon},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/lexicon}}
}
emotion
{'anger': ['str1'], 'fear': ['str2'], 'joy': ['str3'], 'love': ['str4'], 'sadness': ['str5'], 'surprise': ['str6']}
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Unsupervised Emotion Lexicon},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/lexicon}}
}
LLM
Instruction tasks
Prepared instruction tasks for Malay LLM; finetuning scripts are at https://github.com/huseinzol05/malaya/tree/5.1/session/llama2
News
Fake News
Total size: 122.2 MB
- Negative
- Positive
Malaysian fake news, contributed by syazanihussin.
30k News
Total size: 66.6 MB
Crawled from Google News using these keywords:
strings = [
'bank negara OR kewangan malaysia OR kementerian kewangan',
'mata wang malaysia OR bon malaysia OR saham malaysia',
'perdagangan malaysia OR ekonomi malaysia OR sosial malaysia',
'kementerian malaysia',
'kaum melayu OR kaum cina',
'stock market malaysia OR saham malaysia',
'malaysia parliament OR parlimen malaysia',
'asia OR asean',
'malaysia property OR hartanah malaysia',
'artis OR wanita',
'pendidikan OR kesihatan OR infrastruktur',
'dr mahathir OR wan zizah OR lim guan eng OR muhyiddin OR mohamad sabu OR azmin ali',
'umno OR pkr OR mic OR barisan nasional OR parti amanah OR dap',
'isu kerajaan OR isu pembangkang',
'politik OR malaysia OR dunia OR bisnes',
'sukan OR hiburan OR teknologi OR gaya hidup OR automotif',
'johor OR kedah OR kelantan OR melaka',
'negeri sembilan OR pahang OR pulau pinang OR perak',
'perlis OR sabah OR sarawak OR selangor',
'terengganu OR kuala lumpur OR labuan OR putrajaya',
]
Crawled News
Total size: 156 MB
Crawled News Topics
Total size: 1.2 GB
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Malay News based on topics},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/news/news-new}}
}
Complete list (976 news)
- Perayaan Cahaya
- Perayaan Ponggal
- Tahun Baru Hindu
- agama sesat
- air nira
- angan-angan
- angkat berat
- anjing
- antarabangsa
- aplikasi malaysia
- arnab
- arwah ayah
- arwah ibu
- aset digital
- atlet
- babi
- baca buku
- badak sumbu
- bahasa jawa
- bahasa kebangsaan
- bahasa melayu
- banjir
- bankrap
- bawah umur
- belimbing
- berenang
- bergaduh
- bina badan
- bodoh
- bola baling
- bola jaring
- bola keranjang
- boling padang
- buaya
- bulan
- bunian
- burung
- cempedak
- coklat
- cuka
- dakwah islam
- diktator
- disinfeksi
- ditangkap
- dunia islam
- ekonomi islam
- eksport cempedak
- eksport cili padi
- eksport durian
- eksport getah
- eksport kayu
- eksport kelapa sawit
- eksport nenas
- eksport padi
- eksport rambutan
- gajah
- galaksi
- ganti rugi
- gaya baju
- gaya fashion
- gaya jaket
- gaya kasut
- gaya rambut
- gaya rantai
- gaya raya
- gaya seluar
- gaya topi
- gelandangan
- godaan nafsu
- godaan syaitan
- godaan wanita
- godam
- gula apong
- gula
- hantu bungkus
- hantu melayu
- hantu raya
- harga rumah
- hari krismas
- harimau
- hartanah
- hilang kawalan
- hilang kerja
- hoki padang
- hujan lebat
- hujan
- hukum babi
- hutang peribadi
- hutang
- ikan
- imunasi
- industri buku
- industri pertanian
- industri
- isi k-pop
- islam nusantara
- isu 1mdb
- isu Suku Bagahak
- isu Suku Bajau
- isu Suku Brunei
- isu Suku Iban
- isu Suku Idahan
- isu Suku Iranun
- isu Suku Kadazandusun
- isu Suku Lundayeh
- isu Suku Murut
- isu Suku Suluk
- isu Suku Tidong
- isu afghanistan
- isu afrika
- isu agama islam
- isu agama
- isu agensi kelayakan malaysia
- isu agensi nuklear malaysia
- isu agensi penguatkuasaan maritim malaysia
- isu ahli dewan undangan negeri
- isu air
- isu airasia
- isu akta pilihan raya
- isu akuakultur malaysia
- isu alam sekitar
- isu alkohol
- isu amerika
- isu anggota ambulans
- isu anggota bomba
- isu anggota polis
- isu angkatan tentera laut
- isu angkatan tentera malaysia
- isu angkatan tentera udara
- isu anthony loke siew fook
- isu anwar ibrahim
- isu apple
- isu arab
- isu arak
- isu argentina
- isu ariff md yusof
- isu artificial intelligence
- isu artis korea selatan
- isu artis kpop
- isu arul kanda
- isu asean football organization
- isu ask me a question
- isu askar
- isu australia
- isu axiata
- isu ayah pin
- isu ayam penyet
- isu ayam
- isu baba dan nyonya
- isu bahagian hal ehwal undang-undang
- isu bahagian kabinet perlembangan perhubungan antara kerajaan
- isu bahagian kemajuan wilayah persekutuan perancangan lembah klang
- isu bahagian keselamatan negara
- isu bahagian pengurusan hartanah
- isu bahagian pengurusan perkhidmatan sumber manusia
- isu bahagian penyelidikan
- isu bahasa inggeris
- isu bahasa melayu
- isu bahasa pengaturcaraan
- isu baling botol
- isu bangkai
- isu bangladesh
- isu bank kerjasama rakyat malaysia
- isu bank malaysia
- isu bank negara
- isu bank pertanian
- isu barisan nasional
- isu bebas tahanan
- isu berjaya group
- isu bernama
- isu bersatu
- isu big bang
- isu big data
- isu bihun sup
- isu bintulu airport
- isu biro bantuan guaman
- isu biro pengaduan awam
- isu biro tatanegara
- isu biseksual
- isu blackpink
- isu bmw
- isu bola sepak
- isu boling
- isu brazil
- isu brunei
- isu bts
- isu bumi
- isu bumiputera
- isu bung mokhtar
- isu bursa malaysia
- isu cambodia
- isu cambridge analytica
- isu celcom
- isu chinese new year
- isu cikgu
- isu cimb
- isu colombia
- isu costa Rica
- isu counter strike global-offensive
- isu covid
- isu cucms
- isu cukai
- isu daging
- isu dato vida
- isu datuk johari abdul
- isu datuk seri abdul hadi awang
- isu datuk seri azmin ali
- isu deepavali
- isu democratic action party
- isu denmark
- isu dewan bahasa pustaka
- isu dewan bandaraya kuala lumpur
- isu dewan rakyat
- isu diabetes
- isu digi
- isu doktor
- isu donald trump
- isu dota2
- isu e-sport
- isu ekonomi
- isu eropah
- isu euro 2020
- isu ewallet
- isu exo
- isu facebook
- isu felcra
- isu felda
- isu fifa
- isu finland
- isu fizik
- isu foodpanda
- isu futsal
- isu gaji median
- isu gaji menteri
- isu gaji minimum
- isu gamuda berhad
- isu ganja
- isu gay
- isu gejala sosial
- isu german
- isu gimnastik
- isu girl generation
- isu golf
- isu google
- isu grab
- isu grabfood
- isu gst
- isu halal
- isu harga minyak
- isu hari raya aidiladha
- isu hari raya aidilfitri
- isu harimau malaya
- isu hassan merican
- isu highway tol
- isu hockey
- isu honda
- isu hortikultur malaysia
- isu humanoid
- isu hutang negara
- isu hutang
- isu ibm
- isu icerd
- isu idealogi
- isu ikan
- isu ikatan relawan rakyat malaysia
- isu ikea
- isu india
- isu individu penjara
- isu indonesia
- isu industri 4.0
- isu infrastruktur
- isu inisiatif peduli rakyat
- isu insitut kanser negara
- isu instafamous
- isu instagram
- isu institut diplomasi hal ehwal luar negeri
- isu institut diraja
- isu institut jantung negara
- isu institut kefahaman islam malaysia
- isu institut latihan kehakiman perundangan
- isu institut pendidikan guru malaysia
- isu institut penyelidikan kemajuan pertanian malaysia
- isu institut penyelidikan teknologi nuklear malaysia
- isu institut tadbiran awam negara
- isu institut terjemahan negara malaysia
- isu internet
- isu iran
- isu iraq
- isu israel
- isu istana negara
- isu isu badminton
- isu isu bmf
- isu isu china
- isu isu dadah
- isu isu diesel
- isu isu ecrl
- isu isu gaza
- isu isu kemiskinan
- isu isu kerugian
- isu isu kuil
- isu isu lynas
- isu isu masjid
- isu isu palestin
- isu isu plastik
- isu isu rohingya
- isu isu saudi arabia
- isu isu singapura
- isu isu sosma
- isu isu syria
- isu isu tanah
- isu isu tiket
- isu isu wanita
- isu isu yaman
- isu isytihar darurat
- isu itali
- isu jabatan agama islam wilayah persekutuan
- isu jabatan audit negara malaysia
- isu jabatan bekalan air
- isu jabatan bomba penyelamat malaysia
- isu jabatan bubar
- isu jabatan imigresen malaysia
- isu jabatan kebajikan masyarakat malaysia
- isu jabatan kemajuan islam (jakim) department of islamic development
- isu jabatan kerajaan tempatan
- isu jabatan kerja raya malaysia
- isu jabatan keselamatan jalan raya
- isu jabatan kimia malaysia
- isu jabatan landskap negara
- isu jabatan laut malaysia
- isu jabatan meteorologi malaysia
- isu jabatan parlimen malaysia
- isu jabatan peguam negara
- isu jabatan pelancongan malaysia
- isu jabatan pendaftaran negara malaysia
- isu jabatan penerangan malaysia
- isu jabatan penerbangan awam
- isu jabatan pengairan saliran
- isu jabatan pengangkutan jalan
- isu jabatan pengurusan sisa pepejal negara
- isu jabatan penjara malaysia
- isu jabatan perancangan bandar desa semenanjung malaysia
- isu jabatan perancangan bandar desa
- isu jabatan perdana menteri malaysia
- isu jabatan perikanan
- isu jabatan perkhidmatan awam malaysia
- isu jabatan perkhidmatan awam
- isu jabatan perkhidmatan pembetungan
- isu jabatan perkhidmatan veterinar
- isu jabatan perlindungan hidupan liar taman negara
- isu jabatan pertahanan awam malaysia
- isu jabatan pertanian malaysia
- isu jabatan perumahan negara
- isu jabatan tanah galian wilayah persekutuan
- isu jabatan tenaga kerja semenanjung malaysia
- isu jepun
- isu jho low
- isu jordan
- isu judi
- isu k-pop
- isu kadir jasin
- isu kahwin
- isu kapitalisme
- isu kaum cina
- isu kaum india
- isu kaum melayu
- isu kecerdasan buatan
- isu kecurian kereta
- isu kecurian motosikal
- isu kedai alat tulis
- isu kedai baju
- isu kedai basikal
- isu kedai kasut
- isu kedai komputer
- isu kejora
- isu keluar parlimen
- isu keluar parti
- isu kemalangan maut
- isu kemalangan penumpang cedera
- isu kematian wabak
- isu kementerian dalam negeri malaysia
- isu kementerian kerja raya malaysia
- isu kementerian kesihatan malaysia
- isu kementerian kewangan malaysia
- isu kementerian kewangan
- isu kementerian komunikasi multimedia malaysia
- isu kementerian luar negeri malaysia
- isu kementerian pelancongan kebudayaan malaysia
- isu kementerian pembangunan luar bandar
- isu kementerian pembangunan wanita keluarga masyarakat malaysia
- isu kementerian pendidikan malaysia
- isu kementerian pengangkutan malaysia
- isu kementerian perdagangan antarabangsa industri
- isu kementerian perdagangan dalam negeri hal ehwal pengguna malaysia
- isu kementerian pertahanan malaysia
- isu kementerian pertanian industri asas tani
- isu kementerian perumahan kerajaan tempatan malaysia
- isu kementerian perusahaan perladangan komoditi
- isu kementerian sains teknologi inovasi malaysia
- isu kementerian sumber asli alam sekitar malaysia
- isu kementerian sumber manusia malaysia
- isu kementerian tenaga teknologi hijau air malaysia
- isu kementerian wilayah persekutuan malaysia
- isu keracunan
- isu kereta
- isu kertas undi
- isu kes bawah umur
- isu kes buang bayi
- isu kes cabul
- isu kes lemas
- isu kes luar nikah
- isu kes pecah rumah
- isu kes ragut
- isu kes rasuah
- isu kes rogol
- isu kes rompakan
- isu kes tangkap basah
- isu kesihatan
- isu kewangan dan perniagaan
- isu kfc
- isu khazanah
- isu kimia
- isu klinik 1malaysia
- isu kokain
- isu korea selatan
- isu korea utara
- isu kos sara hidup
- isu kota kinabalu airport
- isu kotak undi
- isu kpop
- isu ks jomo
- isu kuala lumpur international airport
- isu kuching airport
- isu kumpulan pengganas asing
- isu kumpulan pengganas tempatan
- isu kuota haji
- isu kwsp
- isu labuan airport
- isu lahad datu airport
- isu laksa
- isu langkawi airport
- isu laos
- isu lazada sells
- isu lembaga jurutera malaysia
- isu lembaga kemajuan ikan malaysia
- isu lembaga kemajuan pertanian kemubu
- isu lembaga kemajuan pertanian muda
- isu lembaga lebuhraya malaysia
- isu lembaga minyak sawit malaysia
- isu lembaga pelabuhan johor
- isu lembaga pelabuhan klang
- isu lembaga pelabuhan kuantan
- isu lembaga pelabuhan pulau pinang
- isu lembaga pemasaran pertanian persekutuan
- isu lembaga pembangunan industri pembinaan
- isu lembaga pembangunan pelaburan malaysia
- isu lembaga penapisan filem
- isu lembaga perindustrian nanas malaysia
- isu lembaga pertubuhan peladang
- isu lembaga tabung haji
- isu lesbian
- isu letupan bom
- isu lgbt
- isu lhdn
- isu liberalisme
- isu mabuk
- isu mahathir
- isu mahkamah persekutuan
- isu mahkamah syariah wilayah persekutuan
- isu majlis agama islam wilayah persekutuan
- isu majlis pakatan harapan
- isu majlis penasihat
- isu majlis tindakan ekonomik negara
- isu makanan malaysia
- isu makro-ekonomi
- isu maktab koperasi malaysia
- isu maktab rendah sains mara
- isu malacca airport
- isu malaysia airlines
- isu malaysia airport
- isu malaysia baru
- isu malaysia-indonesia
- isu malaysian green technology corporation
- isu malware
- isu masalah air
- isu masjid negara
- isu masyarakat
- isu mati dipukul
- isu maybank
- isu mca
- isu mcdonald
- isu media prima
- isu memorandum
- isu menteri alam sekitar dan air
- isu menteri belia dan sukan
- isu menteri besar johor
- isu menteri besar kedah
- isu menteri besar kelantan
- isu menteri besar negeri sembilan
- isu menteri besar perak
- isu menteri besar perlis
- isu menteri besar selangor
- isu menteri besar terengganu
- isu menteri dalam negeri
- isu menteri di jabatan perdana menteri
- isu menteri kanan kerja raya
- isu menteri kanan pendidikan
- isu menteri kanan perdagangan antarabangsa dan industri
- isu menteri kanan pertahanan
- isu menteri kesihatan
- isu menteri kewangan
- isu menteri komunikasi dan multimedia
- isu menteri luar negeri
- isu menteri pelancongan, seni dan budaya
- isu menteri pembangunan luar bandar
- isu menteri pembangunan usahawan dan koperasi
- isu menteri pembangunan, wanita, keluarga dan masyarakat
- isu menteri pengajian tinggi
- isu menteri pengangkutan
- isu menteri perdagangan dalam negeri dan hal ehwal pengguna
- isu menteri perpaduan negara
- isu menteri pertahanan
- isu menteri pertanian dan industri makanan
- isu menteri perumahan dan kerajaan tempatan
- isu menteri perusahaan perladangan dan komoditi
- isu menteri sains, teknologi dan inovasi
- isu menteri sumber manusia
- isu menteri tenaga dan sumber asli
- isu menteri wilayah persekutuan
- isu menyiasat skandal
- isu mercedes
- isu mesir
- isu mexico
- isu mh370
- isu mic
- isu microsoft
- isu mikro-ekonomi
- isu minyak
- isu mira filzah
- isu miri airport
- isu mmu
- isu motogp
- isu motosikal
- isu mrsm
- isu muhyiddin
- isu murtabak
- isu musim durian
- isu mutiara
- isu myanmar
- isu mydin
- isu najib razak
- isu nasa
- isu nasi dagang
- isu nasi kandar
- isu nasi kerabu
- isu nasi
- isu negeri
- isu nepal
- isu new zealand
- isu nilai ringgit jatuh
- isu novel
- isu nurul izzah
- isu orang asli
- isu paedophilia
- isu pakatan harapan
- isu pakistan
- isu palestin
- isu parkir
- isu parlimen
- isu parti amanah
- isu parti islam semalaysia
- isu parti keadilan rakyat
- isu parti pribumi bersatu malaysia
- isu pasaran saham malaysia
- isu pdrm
- isu pejabat ketua pegawai keselamatan kerajaan malaysia
- isu pejabat ketua setiausaha negara
- isu pejabat perdana menteri
- isu pejabat setiausaha persekutuan sabah
- isu pejabat setiausaha persekutuan sarawak
- isu pelajar ipta
- isu pelajar ipts
- isu pelajar luar negara
- isu pelajar maktab
- isu pelajar sekolah menengah
- isu pelajar sekolah rendah
- isu pelajar universiti
- isu pelajar vokasional
- isu pelancongan malaysia
- isu pemilihan agong
- isu penang airport
- isu penasihat sains
- isu pendapatan negara
- isu pendidikan
- isu pengangkutan awam
- isu pengedar dadah
- isu perabot
- isu perancis
- isu perbadanan harta intelek malaysia
- isu perbadanan labuan
- isu perbadanan nasional berhad
- isu perbadanan pengurusan sisa pepejal pembersihan awam
- isu perbadanan putrajaya
- isu perbadanan tabung pendidikan tinggi nasional
- isu perbendaharaan malaysia
- isu perdana menteri
- isu perkahwinan kanak-kanak
- isu perkasa
- isu perkhidmatan am persekutuan
- isu perkhidmatan awam
- isu perkhidmatan kehakiman
- isu perlembagaan malaysia
- isu perodua
- isu perpustakaan kuala lumpur
- isu pertanian durian
- isu pertanian getah
- isu pertanian kelapa sawit
- isu pertanian malaysia
- isu pertanian nenas
- isu pertanian padi
- isu pertanian pisang
- isu petrol
- isu petronas
- isu pewdiepie
- isu piala thomas
- isu pilihan raya kecil
- isu pilihan raya umum
- isu ping pong
- isu plus
- isu polis diraja malaysia
- isu polis
- isu portugal
- isu pos laju
- isu pos malaysia
- isu pos
- isu ppbm
- isu prasarana
- isu privasi
- isu produk berbahaya
- isu program latihan khidmat negara
- isu projek mega
- isu ptptn
- isu pusat daerah mangundi
- isu pusat sains negara
- isu pusat transformasi bandar
- isu racist
- isu radio televisyen malaysia
- isu rafizi ramli
- isu rais yatim
- isu rasuah
- isu reformasi
- isu rhb
- isu risda
- isu robert kuok
- isu rohingya
- isu rosmah mansur
- isu roti canai
- isu roti
- isu royalti minyak
- isu rumah mampu milik
- isu rusia
- isu sabotaj parti
- isu saham dan komoditi
- isu sahur
- isu sains data
- isu sains
- isu sampah
- isu sandakan airport
- isu saudi
- isu sekolah jenis kebangsaan cina
- isu sekolah jenis kebangsaan india
- isu sekolah menengah kebangsaan jenis cina
- isu sekolah menengah kebangsaan jenis india
- isu sekolah
- isu sepak takraw
- isu shafie apdal
- isu shopee sells
- isu sibu airport
- isu sime darby
- isu sirim
- isu siti kasim
- isu skim peduli sihat
- isu sosco
- isu sosial media
- isu sosial
- isu srikandi
- isu ssm
- isu sst
- isu starbucks
- isu subsidi kerajaan
- isu sultan abdul halim airport
- isu sultan azlan shah airport
- isu sultan haji ahmad shah airport
- isu sultan ismail petra airport
- isu sultan johor
- isu sultan kedah
- isu sultan kelantan
- isu sultan mahmud airport
- isu sultan negeri sembilan
- isu sultan perlis
- isu sultan selangor
- isu sultan terengganu
- isu sumbat
- isu sumber asli
- isu sungai
- isu sunway
- isu surau
- isu suruhanjaya komunikasi multimedia malaysia
- isu suruhanjaya koperasi malaysia
- isu suruhanjaya pencegahan rasuah malaysia
- isu suruhanjaya pengankutan awam darat
- isu suruhanjaya perdagangan komoditi
- isu suruhanjaya perkhidmatan air negara
- isu suruhanjaya perkhidmatan awam
- isu suruhanjaya perkhidmatan pendidikan
- isu suruhanjaya persaingan malaysia
- isu suruhanjaya pilihan raya malaysia
- isu suruhanjaya pilihan raya
- isu suruhanjaya syarikat malaysia
- isu suruhanjaya tenaga
- isu survei institut darul ehsan
- isu susu
- isu sweden
- isu syarikat permulaan
- isu syarikat
- isu syed saddiq
- isu syria
- isu tabung ekonomi kumpulan usaha niaga
- isu tabung haji
- isu tabung harapan
- isu taekwondo
- isu tan sri dr rais yatim
- isu tan sri mokhzani mahathir
- isu taska
- isu tawau airport
- isu teknologi
- isu telefon
- isu telegram
- isu telekom malaysia
- isu tengku razaleigh hamzah
- isu tenis
- isu tentera darat malaysia
- isu tentera laut diraja malaysia
- isu tentera malaysia
- isu tentera udara diraja malaysia
- isu thai cave
- isu tiga penjuru
- isu timbalan perdana menteri
- isu tioman airport
- isu tipu sijil
- isu tng
- isu touch n go
- isu toyota
- isu transeksual
- isu transgender
- isu tribunal perkhidmatan awam
- isu tribunal perumahan pengurusan strata
- isu trojan
- isu tsunami fitnah
- isu tsunami
- isu tuhan
- isu tun daim zainuddin
- isu tunku ismail idris
- isu turki
- isu twitter
- isu u mobile
- isu uem
- isu uia
- isu uitm
- isu ukm
- isu ulama
- isu ulamak
- isu um
- isu umno
- isu undi pos
- isu undi rosak
- isu unifi
- isu unikl
- isu unimas
- isu unit khas teknologi tinggi
- isu unit pemodenan tadbiran perancangan pengurusan malaysia
- isu unit penyelarasan pelaksanaan
- isu unit perancang ekonomi
- isu united kingdom
- isu universiti
- isu upm
- isu usm
- isu ustaz
- isu ustazah
- isu utp
- isu vaksin
- isu valve corporation
- isu veveonah
- isu vietnam
- isu wan azizah
- isu whatsapp
- isu wisma
- isu world cup
- isu yaman
- isu yang di-pertuan agong
- isu yayasan hijau malaysia
- isu youtube rewind
- isu youtube
- isu ytl
- isu zakir naik
- isu zeti aziz
- jambu
- jiwa
- jururawat
- jurutera
- kacau
- kambing
- kampus
- kanak kanak
- kapitalis
- kecerdasan buatan
- kedai bayi
- kedai elektronik
- kedai haiwan
- kedai kain
- kedai kereta
- kedai makan
- kedai minumam
- kedai minuman
- kedai perabot
- kedai permainan
- kedai telefon
- kedai ubat
- kedai urut
- kelahiran jesus
- kelapa
- kelaparan
- kelawar
- kemalangan
- kemarau
- kerajaan adil
- kerajaan prihatin
- kerajaan sayang
- kerajaan zalim
- kertas penyelidikan
- kes dera
- kes positif
- ketupat
- kewangan islam
- komunis
- komunisme
- kopi
- kosmetik
- kubur
- kucing
- kuda
- kuliah
- kurang mampu
- landak
- langsuir
- lapangan terbang
- lebuh rajaya
- lelaki maut
- lelaki
- lemang
- lembu
- licin
- lohong hitam
- lontong
- lumba basikal
- lumba kuda
- makanan segera
- mata air
- mata wang digital
- mata wang kripto
- mata wang malaysia
- mata wang
- memanah
- menembak
- menganggur
- mesin judi
- mimpi
- monyet
- muflis
- musang
- najib razak bersalah
- najib razak mahkamah
- najib razak rasuah
- nangka
- nasional berhad
- nira nipah
- olahraga
- orang awam
- orang gila
- orang kurang upaya
- orang minyak
- parti bersatu
- pelesit
- peluang pekerjaan
- pembalakan kelantan
- pembalakan
- pembaziran
- pencemaran air
- pencemaran udara
- penganggur
- pengaturcaraan
- pensyarah
- penyakit misteri
- peracunan
- perahu layar
- perayaan Hari Gawai
- perempuan
- peretas
- permainan
- perpustakaan
- pesawat
- piala dunia
- pinjaman bank
- pinjaman islam
- pinjaman peribadi
- pocong
- pontianak
- populate-news-sentiment
- populate-news
- ragbi
- rambutan
- rasuah 1mdb
- rasuah afrika
- rasuah amerika
- rasuah anwar
- rasuah arab
- rasuah barisan nasional
- rasuah donald trump
- rasuah israel
- rasuah johor
- rasuah kelantan
- rasuah mahathir
- rasuah najib
- rasuah pas
- rasuah penang
- rasuah perlis
- rasuah pkr
- rasuah rosmah
- rasuah singapore
- rasuah thailand
- rasuah umno
- remaja
- rendang
- rumah tangga
- rusa
- rusia
- saham syarikat
- sanitasi
- sejarah islam
- sejarah nabi
- silat
- singa
- skandal boyband
- skandal kpop
- sosialis
- strategi bisnes
- strategi perniagaan
- suara wanita
- sukan elektronik
- swasta
- tak masuk akal
- tanda kiamat
- tebu
- tenaga nasional
- tenaga
- terbaring
- tinju
- toyol
- trafik
- transaksi
- tunggang agama
- ujian klinikal
- vaksin
- verifikasi
- wanita maut
- warga berharap
- zirafah
Articles
Total size: 3.1 MB
- Filem
- Kerajaan
- Pembelajaran
- Pendidikan
- Sekolah
Headline
Total size: 555.6 MB
Natural Language Query
SPIDER
Total size: 99.4 MB
{'db_id': 'concert_singer',
'query': 'SELECT count(*) FROM singer',
'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'singer'],
'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'singer'],
'question': 'How many singers do we have?',
'question_toks': ['How', 'many', 'singers', 'do', 'we', 'have', '?'],
'sql': {'except': None,
'from': {'conds': [], 'table_units': [['table_unit', 1]]},
'groupBy': [],
'having': [],
'intersect': None,
'limit': None,
'orderBy': [],
'select': [False, [[3, [0, [0, 0, False], None]]]],
'union': None,
'where': []},
'question_bahasa': 'Berapa banyak penyanyi yang kita ada?'}
@inproceedings{Yu&al.18c,
title = {Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task},
author = {Tao Yu and Rui Zhang and Kai Yang and Michihiro Yasunaga and Dongxu Wang and Zifan Li and James Ma and Irene Li and Qingning Yao and Shanelle Roman and Zilin Zhang and Dragomir Radev},
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
year = 2018
}
COSQL
Total size: 105.5 MB
{'db_id': 'concert_singer',
'query': 'SELECT count(*) FROM singer',
'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'singer'],
'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'singer'],
'question': 'How many singers do we have?',
'question_toks': ['How', 'many', 'singers', 'do', 'we', 'have', '?'],
'sql': {'except': None,
'from': {'conds': [], 'table_units': [['table_unit', 1]]},
'groupBy': [],
'having': [],
'intersect': None,
'limit': None,
'orderBy': [],
'select': [False, [[3, [0, [0, 0, False], None]]]],
'union': None,
'where': []},
'question_bahasa': 'Berapa banyak penyanyi yang kita ada?'}
@article{DBLP:journals/corr/abs-1909-05378,
author = {Tao Yu and
Rui Zhang and
Heyang Er and
Suyi Li and
Eric Xue and
Bo Pang and
Xi Victoria Lin and
Yi Chern Tan and
Tianze Shi and
Zihan Li and
Youxuan Jiang and
Michihiro Yasunaga and
Sungrok Shim and
Tao Chen and
Alexander R. Fabbri and
Zifan Li and
Luyao Chen and
Yuwen Zhang and
Shreya Dixit and
Vincent Zhang and
Caiming Xiong and
Richard Socher and
Walter S. Lasecki and
Dragomir R. Radev},
title = {CoSQL: {A} Conversational Text-to-SQL Challenge Towards Cross-Domain
Natural Language Interfaces to Databases},
journal = {CoRR},
volume = {abs/1909.05378},
year = {2019},
url = {http://arxiv.org/abs/1909.05378},
archivePrefix = {arXiv},
eprint = {1909.05378},
timestamp = {Wed, 12 May 2021 16:44:19 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1909-05378.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
SPARC
Total size: 100.3 MB
{'db_id': 'concert_singer',
'query': 'SELECT count(*) FROM singer',
'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'singer'],
'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'singer'],
'question': 'How many singers do we have?',
'question_toks': ['How', 'many', 'singers', 'do', 'we', 'have', '?'],
'sql': {'except': None,
'from': {'conds': [], 'table_units': [['table_unit', 1]]},
'groupBy': [],
'having': [],
'intersect': None,
'limit': None,
'orderBy': [],
'select': [False, [[3, [0, [0, 0, False], None]]]],
'union': None,
'where': []},
'question_bahasa': 'Berapa banyak penyanyi yang kita ada?'}
@article{DBLP:journals/corr/abs-1906-02285,
author = {Tao Yu and
Rui Zhang and
Michihiro Yasunaga and
Yi Chern Tan and
Xi Victoria Lin and
Suyi Li and
Heyang Er and
Irene Li and
Bo Pang and
Tao Chen and
Emily Ji and
Shreya Dixit and
David Proctor and
Sungrok Shim and
Jonathan Kraft and
Vincent Zhang and
Caiming Xiong and
Richard Socher and
Dragomir R. Radev},
title = {SParC: Cross-Domain Semantic Parsing in Context},
journal = {CoRR},
volume = {abs/1906.02285},
year = {2019},
url = {http://arxiv.org/abs/1906.02285},
archivePrefix = {arXiv},
eprint = {1906.02285},
timestamp = {Wed, 12 May 2021 16:44:19 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1906-02285.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Normalization
Rumi-to-Jawi
Originally from https://www.ejawi.net/converterV2.php?go=rumi
Total size: 1.4 GB
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Rumi-to-Jawi Dataset},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/normalization/rumi-jawi}}
}
Stemmer
Total size: 80 MB
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Stemming and Lemmatization Dataset},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/normalization/stemmer}}
}
IIUM Confession
Total size: 406 MB
Optical Character Recognition
Malay-to-Jawi
Total size: 445.3 MB
The dataset is simple: the Malay label can be read from the image file name, e.g. idola.png.
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Malay-to-Jawi Dataset},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/normalization/stemmer}}
}
Malay handwriting (Satisfy-Regular)
Total size: 194.4 MB
The dataset is simple: the Malay label can be read from the image file name, e.g. syarif.png.
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Generated Handwriting Dataset},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/ocr/handwriting}}
}
Paraphrase
General
Total size: 31.0 MB
Extracted from MS COCO Captions.
@article{DBLP:journals/corr/LinMBHPRDZ14,
author = {Tsung{-}Yi Lin and
Michael Maire and
Serge J. Belongie and
Lubomir D. Bourdev and
Ross B. Girshick and
James Hays and
Pietro Perona and
Deva Ramanan and
Piotr Doll{\'{a}}r and
C. Lawrence Zitnick},
title = {Microsoft {COCO:} Common Objects in Context},
journal = {CoRR},
volume = {abs/1405.0312},
year = {2014},
url = {http://arxiv.org/abs/1405.0312},
archivePrefix = {arXiv},
eprint = {1405.0312},
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
biburl = {https://dblp.org/rec/journals/corr/LinMBHPRDZ14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Funpedia
Total size: 68.8 MB
@article{DBLP:journals/corr/MillerFFLBBPW17,
author = {Alexander H. Miller and
Will Feng and
Adam Fisch and
Jiasen Lu and
Dhruv Batra and
Antoine Bordes and
Devi Parikh and
Jason Weston},
title = {ParlAI: {A} Dialog Research Software Platform},
journal = {CoRR},
volume = {abs/1705.06476},
year = {2017},
url = {http://arxiv.org/abs/1705.06476},
archivePrefix = {arXiv},
eprint = {1705.06476},
timestamp = {Mon, 13 Aug 2018 16:47:16 +0200},
biburl = {https://dblp.org/rec/journals/corr/MillerFFLBBPW17.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Reference: https://github.com/facebookresearch/ParlAI/tree/master/parlai/tasks/funpedia
ParaSCI
Total size: 177 MB
@misc{dong2021parasci,
  title         = {{ParaSCI}: A Large Scientific Paraphrase Dataset for Longer Paraphrase Generation},
  author        = {Qingxiu Dong and Xiaojun Wan and Yue Cao},
  year          = {2021},
  eprint        = {2101.08382},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}
PAWS
Total size: 16 MB
@misc{zhang2019paws,
  title         = {{PAWS}: Paraphrase Adversaries from Word Scrambling},
  author        = {Yuan Zhang and Jason Baldridge and Luheng He},
  year          = {2019},
  eprint        = {1904.01130},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}
Semisupervised Academia
Total size: 73.7 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/paraphrase/semisupervised-academia}},
  note         = {We gather Bahasa Malaysia corpus!, Semisupervised Academia.edu Paraphrases using T5-Bahasa}
}
Semisupervised News
Total size: 311.3 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/paraphrase/semisupervised-academia}},
  note         = {We gather Bahasa Malaysia corpus!, Semisupervised Bahasa News Paraphrases using T5-Bahasa}
}
Semisupervised Wikipedia
Total size: 233.4 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/paraphrase/semisupervised-academia}},
  note         = {We gather Bahasa Malaysia corpus!, Semisupervised Bahasa Wikipedia Paraphrases using T5-Bahasa}
}
Parsing
Constituency
Total size: 3.5 MB
Jessica Naraiswari Arwidarasti, Ika Alfina, Adila Alfa Krisnadhi, "Adjusting Indonesian Multiword Expression Annotation to the Penn Treebank Format", Asian Language Processing (IALP) 2020 International Conference on, pp. 75-80, 2020.
Dependency
Total size: 24.1 MB
@misc{ud_indonesian-pud,
  title   = {UD Indonesian PUD},
  url     = {https://universaldependencies.org/treebanks/id_pud/index.html},
  journal = {UD_Indonesian-PUD}
}
Phoneme
Total size: 57 KB
Question-Answer
Common Crawl QA
Total size: 328 MB
Extractive News QA
Total size: 216 MB
Hansard QA
Total size: 365 MB
General
Total size: 2.5 MB
1 mary pergi ke taman. 2 mary pergi ke dapur. 3 husein kembali ke pejabat.
4 husein perjalanan ke lorong. 5 jeff kembali ke bilik tidur. 6 fred berpindah ke lorong.
7 husein berpindah ke bilik mandi. 8 jeff kembali ke taman. 9 jeff kembali ke dapur.
10 fred kembali ke taman. 11 mary mendapat bola sepak di sana. 12 mary menyerahkan bola sepak kepada jeff.
13 apa yang mary berikan kepada jeff? <> bola sepak <> 12.
14 husein kembali ke lorong. 15 jeff kembali ke bilik tidur. 16 apa yang mary berikan kepada jeff? <> bola sepak <> 12.
17 fred berpindah ke bilik mandi. 18 mary mengambil susu di sana. 19 apa yang mary berikan kepada jeff? <> bola sepak <> 12.
20 fred pergi ke dapur. 21 mary menyerahkan susu itu kepada fred. 22 siapa yang memberikan susu itu kepada fred? <> mary <> 21.
23 fred berpindah ke lorong. 24 jeff pergi ke pejabat. 25 siapa yang mary memberikan susu itu? <> fred <> 21
SQUAD
Total size: 129.1MB
@article{DBLP:journals/corr/abs-1806-03822,
  author        = {Pranav Rajpurkar and
                   Robin Jia and
                   Percy Liang},
  title         = {Know What You Don't Know: Unanswerable Questions for {SQuAD}},
  journal       = {CoRR},
  volume        = {abs/1806.03822},
  year          = {2018},
  url           = {http://arxiv.org/abs/1806.03822},
  archivePrefix = {arXiv},
  eprint        = {1806.03822},
  timestamp     = {Mon, 13 Aug 2018 16:48:21 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1806-03822.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Natural Questions
Total size: 8MB
@article{47761,
  title   = {Natural Questions: a Benchmark for Question Answering Research},
  author  = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
  year    = {2019},
  journal = {Transactions of the Association for Computational Linguistics}
}
Segmentation
Total size: 2.2 GB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/segmentation}},
  note         = {We gather Bahasa Malaysia corpus!, Segmentation Augmentation}
}
Sentiment
Local News
Total size: 496 KB
- Positive
- Negative
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/sentiment/news-sentiment}},
  note         = {We gather Bahasa Malaysia corpus!, Supervised Sentiment for Bahasa News}
}
Semisupervised Twitter
Total size: 25.3 MB
Stack XLNET BASE + BERT BASE on Supervised Twitter and Supervised Twitter Politics.
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/sentiment/semi-supervised-twitter}},
  note         = {We gather Bahasa Malaysia corpus!, Semi-Supervised Sentiment for Bahasa Twitter}
}
Supervised Twitter
Total size: 366 KB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/sentiment/supervised-twitter}},
  note         = {We gather Bahasa Malaysia corpus!, Supervised Sentiment for Bahasa Twitter}
}
Supervised Twitter Politics
Total size: 223 KB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/sentiment/supervised-twitter}},
  note         = {We gather Bahasa Malaysia corpus!, Supervised Sentiment for Bahasa Twitter Politics}
}
Spelling Correction
Neuspell
Total size: 1.2 GB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/spelling-correction/neuspell}},
  note         = {We gather Bahasa Malaysia corpus!, Spelling Correction Augmentation}
}
Summarization
CNN News
Total size: 900 MB
@article{DBLP:journals/corr/SeeLM17,
  author        = {Abigail See and Peter J. Liu and Christopher D. Manning},
  title         = {Get To The Point: Summarization with Pointer-Generator Networks},
  journal       = {CoRR},
  volume        = {abs/1704.04368},
  year          = {2017},
  eprint        = {1704.04368},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1704.04368},
  timestamp     = {Mon, 13 Aug 2018 16:46:08 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/SeeLM17.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
DailyMail
Total size: 2.1 GB
@article{DBLP:journals/corr/SeeLM17,
  author        = {Abigail See and Peter J. Liu and Christopher D. Manning},
  title         = {Get To The Point: Summarization with Pointer-Generator Networks},
  journal       = {CoRR},
  volume        = {abs/1704.04368},
  year          = {2017},
  eprint        = {1704.04368},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1704.04368},
  timestamp     = {Mon, 13 Aug 2018 16:46:08 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/SeeLM17.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Gigawords
Total size: 450 MB
@article{graff2003english,
  title   = {English {Gigaword}},
  author  = {Graff, David and Kong, Junbo and Chen, Ke and Maeda, Kazuaki},
  journal = {Linguistic Data Consortium, Philadelphia},
  volume  = {4},
  number  = {1},
  pages   = {34},
  year    = {2003}
}
@inproceedings{Rush_2015,
  author    = {Rush, Alexander M. and Chopra, Sumit and Weston, Jason},
  title     = {A Neural Attention Model for Abstractive Sentence Summarization},
  booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  year      = {2015},
  doi       = {10.18653/v1/d15-1044},
  url       = {http://dx.doi.org/10.18653/v1/D15-1044}
}
Multinews
Total size: 680 MB
@article{DBLP:journals/corr/abs-1906-01749,
  author        = {Alexander R. Fabbri and
                   Irene Li and
                   Tianwei She and
                   Suyi Li and
                   Dragomir R. Radev},
  title         = {Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},
  journal       = {CoRR},
  volume        = {abs/1906.01749},
  year          = {2019},
  eprint        = {1906.01749},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1906.01749},
  timestamp     = {Thu, 13 Jun 2019 13:36:00 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1906-01749.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Semisupervised AstroAwani
Abstractive output from T5-base-bahasa summarized astroawani news.
Total size: 364.69 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/summarization/semisupervised}},
  note         = {We gather Bahasa Malaysia corpus!, Semisupervised Bahasa News Summarization using T5-Bahasa}
}
Semisupervised News
Abstractive output from T5-base-bahasa summarized 100k local news.
Total size: 303 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/summarization/semisupervised}},
  note         = {We gather Bahasa Malaysia corpus!, Semisupervised Bahasa News Summarization using T5-Bahasa}
}
Xwikis
Total size: 6270.8 MB
@article{DBLP:journals/corr/SeeLM17,
  author        = {Abigail See and Peter J. Liu and Christopher D. Manning},
  title         = {Get To The Point: Summarization with Pointer-Generator Networks},
  journal       = {CoRR},
  volume        = {abs/1704.04368},
  year          = {2017},
  eprint        = {1704.04368},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1704.04368},
  timestamp     = {Mon, 13 Aug 2018 16:46:08 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/SeeLM17.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
Tagging
Part-of-Speech
Total size: 3.1 MB
- ADJ - Adjective, kata sifat
- ADP - Adposition
- ADV - Adverb, kata keterangan
- ADX - Auxiliary verb, kata kerja tambahan
- CCONJ - Coordinating conjunction, kata hubung
- DET - Determiner, kata penentu
- NOUN - Noun, kata nama
- NUM - Number, nombor
- PART - Particle
- PRON - Pronoun, kata ganti
- PROPN - Proper noun, kata ganti nama khas
- SCONJ - Subordinating conjunction
- SYM - Symbol
- VERB - Verb, kata kerja
- X - Other
@misc{ud_indonesian-pud,
  title   = {UD Indonesian PUD},
  url     = {https://universaldependencies.org/treebanks/id_pud/index.html},
  journal = {UD_Indonesian-PUD}
}
Augmentation,
Entities
Total size: 3.1 MB
- OTHER - Other
- law - law, regulation, related law documents, documents, etc
- location - location, place
- organization - organization, company, government, facilities, etc
- person - person, group of people, beliefs, etc
- quantity - numbers, quantity
- time - date, day, time, etc
- event - unique event that happened, etc
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/tagging/entities}},
  note         = {We gather Bahasa Malaysia corpus!, Augmentation Indonesian Entities using Rules based}
}
Augmentation,
Semisupervised Entities Parliament
Voting stack using Malaya entities models on Parliament texts.
Total size: 129 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/semi-supervised/twitter}},
  note         = {We gather Bahasa Malaysia corpus!, Semi-Supervised Entities for Parliament texts}
}
Text similarity
Quora
Total size: 60.8 MB
@misc{kaggle,
  title   = {Quora Question Pairs},
  url     = {https://www.kaggle.com/c/quora-question-pairs},
  journal = {Kaggle}
}
SNLI
Total size: 256.8 MB
Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large annotated corpus for learning natural language inference. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (EMNLP). [pdf] [bib]
MNLI
Total size: 286.2 MB
@InProceedings{N18-1101,
  author    = {Williams, Adina and Nangia, Nikita and Bowman, Samuel},
  title     = {A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  pages     = {1112--1122},
  location  = {New Orleans, Louisiana},
  url       = {http://aclweb.org/anthology/N18-1101}
}
Tokenization
Syllable
Gathered from https://prpm.dbp.gov.my/
Total size: 2 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/tokenization/syllable}},
  note         = {We gather Bahasa Malaysia corpus!, Syllable tokenization}
}
Translation
ChatGPT3.5 b.cari.com.my
Total size: 750 MB
ChatGPT3.5 c.cari.com.my
Total size: 750 MB
ChatGPT3.5 Facebook
Total size: 53.1 MB
ChatGPT3.5 IIUM Confession
Total size: 426.86 MB
ChatGPT3.5 Manglish
Total size: 351 MB
ChatGPT3.5 NLLB-BJN
Total size: 210 MB
ChatGPT3.5 Twitter
Total size: 16 MB
EN-MS Alignment
a black cat
kucing hitam
-> 1-1 2-0
Provided Forward and Reversed alignment.
Total size: 6.1 GB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/translation/english-news}},
  note         = {We gather Bahasa Malaysia corpus!, Alignment EN-MS}
}
IIUM-Confession
Malay to English.
Total size: 562 KB
Google Translate MS-EN
Total size: 935.3 MB
Opus
Parsed from http://opus.nlpl.eu/, ms (Malay) -> en (English)
Total size: 262.6 MB
@InProceedings{TIEDEMANN12.463,
  author    = {Tiedemann, J{\"o}rg},
  title     = {Parallel Data, Tools and Interfaces in {OPUS}},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)},
  year      = {2012},
  month     = may,
  date      = {23-25},
  address   = {Istanbul, Turkey},
  editor    = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Mehmet Ugur Dogan and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn      = {978-2-9517408-7-7},
  language  = {english}
}
Parliament
Parsed from Malaysian parliament texts and translated to English.
Total size: 47.6 MB
Local Movies Subtitles
Total size: 11.4 MB
English News
English to Malay.
Total size: 2.5 GB
Long text
Malay to English. Focused on long text translation.
Total size: 1.7 GB
EN-MS Alignment
EN-MS Alignment using eflomal.
Total size: 300 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/translation/en-ms-alignment}},
  note         = {We gather Bahasa Malaysia corpus!, Alignment EN-MS}
}
MS-EN Alignment
MS-EN Alignment using eflomal.
Total size: 300 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/translation/ms-en-alignment}},
  note         = {We gather Bahasa Malaysia corpus!, Alignment MS-EN}
}
Noisy MS-EN Augmentation
Augmented using a social media lexicon and English word replacement via word alignment.
Total size: 721 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/translation/noisy-ms-en-augmentation}},
  note         = {We gather Bahasa Malaysia corpus!, Noisy MS-EN Augmentation}
}
Noisy EN-MS Augmentation
Augmented using a social media lexicon and English word replacement via word alignment.
Total size: 433.4 MB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/translation/noisy-en-ms-augmentation}},
  note         = {We gather Bahasa Malaysia corpus!, Noisy EN-MS Augmentation}
}
NLLB-EN-MS
Total size: 2065 MB
@misc{https://doi.org/10.48550/arxiv.2207.04672,
  doi       = {10.48550/ARXIV.2207.04672},
  url       = {https://arxiv.org/abs/2207.04672},
  author    = {{NLLB Team} and Costa-jussà, Marta R. and Cross, James and Çelebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzmán, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff},
  keywords  = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, I.2.7, 68T50},
  title     = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  publisher = {arXiv},
  year      = {2022},
  copyright = {Creative Commons Attribution Share Alike 4.0 International}
}
NLLB-MS-JAV
Total size: 1165.92 MB
@misc{https://doi.org/10.48550/arxiv.2207.04672,
  doi       = {10.48550/ARXIV.2207.04672},
  url       = {https://arxiv.org/abs/2207.04672},
  author    = {{NLLB Team} and Costa-jussà, Marta R. and Cross, James and Çelebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzmán, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff},
  keywords  = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, I.2.7, 68T50},
  title     = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  publisher = {arXiv},
  year      = {2022},
  copyright = {Creative Commons Attribution Share Alike 4.0 International}
}
True Case
Build custom true case augmentation.
Total size: 8.9 GB
@misc{Malay-Dataset,
  author       = {Husein, Zolkepli},
  title        = {Malay-Dataset},
  year         = {2018},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/truecase}},
  note         = {We gather Bahasa Malaysia corpus!, True Case Augmentation}
}