@mastersthesis{digilib72937,
  title    = {KOMPARASI PERFORMA LARGE LANGUAGE MODELS UNTUK TUGAS PERINGKASAN TEKS BERBAHASA INDONESIA},
  author   = {R. Abdullah Hammami},
  school   = {UIN Sunan Kalijaga Yogyakarta},
  year     = {2025},
  month    = {August},
  note     = {NIM 22206051019. Thesis supervisor: Dr. Agung Fatwanto, S.Si., M.Kom.},
  keywords = {Text Summarization, Fine-tuning, Gemma2, LLaMA3},
  url      = {https://digilib.uin-suka.ac.id/id/eprint/72937/},
  abstract = {The rapid growth of online information, coupled with low reading interest and heterogeneous literacy levels in Indonesia, calls for automatic summarization that is concise, accurate, and context-sensitive. Given Indonesian's status as a low-resource language, a systematic evaluation of locally adapted models is warranted. This study compares four Indonesian-capable large language models (Gemma2 9B CPT Sahabat-AI v1 Instruct, Llama3 8B CPT Sahabat-AI v1 Instruct, Gemma-SEA-LION-v3-9B-IT, and Llama-SEA-LION-v3-8B-IT) on news summarization to identify the most suitable model for practical use. We employ a benchmarking protocol on the IndoSum test subset (3,762 articles) comprising preprocessing (token reconstruction and punctuation cleanup), prompt design, 8-bit quantized inference, and automated evaluation with ROUGE (1/2/L; precision, recall, F1), BLEU, METEOR, and BERTScore. Inference is executed in four batches to meet computational constraints, and evaluation is standardized across models. Llama3 8B CPT Sahabat-AI v1 Instruct achieves the most balanced performance: ROUGE F1 42.05\% (precision 42.27\%; recall 42.68\%), BLEU 25.10\%, and BERTScore P/R/F1 88.68\%/88.43\%/88.54\%. Gemma2 9B CPT Sahabat-AI v1 Instruct excels in coverage: ROUGE recall 48.23\%, ROUGE F1 39.50\%, BLEU 22.70\%, METEOR 47.20\%, and BERTScore 86.78\%/89.17\%/87.95\%. The SEA-LION models perform lower: Gemma-SEA-LION-v3-9B-IT (ROUGE P/R/F1 25.77\%/37.58\%/30.37\%; BLEU 12.65\%; METEOR 37.72\%; BERTScore 84.63\%/87.36\%/85.97\%) and Llama-SEA-LION-v3-8B-IT (ROUGE 25.22\%/33.84\%/28.71\%; BLEU 11.06\%; METEOR 34.57\%; BERTScore 84.46\%/86.80\%/85.61\%). Overall, the Indonesian-optimized (Sahabat-AI) models are superior and more stable. Llama3 8B is preferable when balancing precision, coverage, and structural consistency; Gemma2 9B is better when recall and semantic alignment with the source are prioritized.}
}
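
The abstract outlines a benchmarking protocol (8-bit quantized inference followed by ROUGE/BLEU/METEOR/BERTScore scoring). Below is a minimal Python sketch of that pipeline, not the thesis's actual code: the Hugging Face model ID, the prompt wording, the generation settings, and the two toy article/reference strings are illustrative assumptions, since none of them are specified in this record. BibTeX ignores text outside entries, so this note does not affect parsing.

# Sketch of the protocol from the abstract: 8-bit quantized inference with
# one candidate model, then automated scoring with ROUGE, BLEU, METEOR,
# and BERTScore. Model ID, prompt, and generation settings are assumptions.
import evaluate
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct"  # assumed HF ID

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit inference
    device_map="auto",
)

def summarize(article: str) -> str:
    # Illustrative Indonesian summarization prompt (not the thesis's prompt).
    messages = [{"role": "user",
                 "content": f"Ringkas berita berikut dalam 2-3 kalimat:\n\n{article}"}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        out = model.generate(inputs, max_new_tokens=128, do_sample=False)
    # Decode only the newly generated tokens, dropping the prompt.
    return tokenizer.decode(out[0, inputs.shape[-1]:], skip_special_tokens=True)

# In the study, articles and references come from the IndoSum test subset
# (3,762 items, evaluated in four batches); two toy strings keep this
# sketch self-contained.
articles = ["<artikel berita 1>", "<artikel berita 2>"]
references = ["<ringkasan rujukan 1>", "<ringkasan rujukan 2>"]
predictions = [summarize(a) for a in articles]

rouge = evaluate.load("rouge").compute(predictions=predictions, references=references)
bleu = evaluate.load("bleu").compute(predictions=predictions,
                                     references=[[r] for r in references])
meteor = evaluate.load("meteor").compute(predictions=predictions, references=references)
bert = evaluate.load("bertscore").compute(predictions=predictions,
                                          references=references, lang="id")

print(rouge["rouge1"], rouge["rouge2"], rouge["rougeL"])   # F1 by default
print(bleu["bleu"], meteor["meteor"])
print(sum(bert["f1"]) / len(bert["f1"]))                    # mean BERTScore F1

The same loop would be repeated for each of the four models, with identical prompts and metric settings, to match the standardized evaluation the abstract describes.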