;;; Nani Project website
;;; Copyright © 2022 Julien Lepiller
;;;
;;; This file is part of the Nani Project website.
;;;
;;; The Nani Project website is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU Affero General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; The Nani Project website is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU Affero General Public License for more details.
;;;
;;; You should have received a copy of the GNU Affero General Public License
;;; along with the Nani Project website.  If not, see https://www.gnu.org/licenses/.

(use-modules (srfi srfi-9))
(use-modules (srfi srfi-11))
(use-modules (ice-9 match))
(use-modules (ice-9 binary-ports))
(use-modules (ice-9 textual-ports))
(use-modules (nani sentence sentence))

;; Return the path of the Tatoeba export named FILE: the directory of this
;; script, followed by "/../dictionaries/tatoeba_" and FILE.
(define (tatoeba-file file)
  (string-append (dirname (current-filename)) "/../dictionaries/tatoeba_" file))

;; Read "sentences_detailed.csv" and return two values, both lists of
;; (ID . TEXT) pairs in file order:
;;   1. sentences whose language field is "jpn" and whose author is listed in
;;      NATIVE-USERS (a list of user names);
;;   2. sentences whose language field is LANG.
;; A sentence whose language is LANG always goes to the second list, even when
;; LANG is "jpn".  Every line must have exactly six tab-separated fields.
(define (get-sentences lang native-users)
  ;; Index the user names once, so the per-sentence membership test does not
  ;; walk the whole list for every line of the file.
  (define native-table (make-hash-table))
  (for-each (lambda (name) (hash-set! native-table name #t)) native-users)
  (call-with-input-file (tatoeba-file "sentences_detailed.csv")
    (lambda (port)
      (let loop ((jpn '()) (trans '()) (line (get-line port)))
        (if (eof-object? line)
            (values (reverse jpn) (reverse trans))
            (match (string-split line #\tab)
              ((id slang text user _ _)
               (cond
                 ((equal? lang slang)
                  (loop jpn
                        (cons (cons (string->number id) text) trans)
                        (get-line port)))
                 ((and (equal? slang "jpn") (hash-ref native-table user))
                  (loop (cons (cons (string->number id) text) jpn)
                        trans
                        (get-line port)))
                 (else
                  (loop jpn trans (get-line port)))))))))))

;; Read "user_languages.csv" and return the list of user names whose entry has
;; language "jpn" and level "5".  Every record must start with a line made of
;; exactly four tab-separated fields.
(define (get-native-jpn-users)
  (call-with-input-file (tatoeba-file "user_languages.csv")
    (lambda (port)
      (let loop ((jpn '()) (line (get-line port)))
        (if (eof-object? line)
            jpn
            (match (string-split line #\tab)
              ((ulang level user _)
               ;; consume the comment if it's on multiple lines: a line ending
               ;; with a backslash continues on the next one.  The string?
               ;; test stops cleanly if the file ends inside such a comment.
               (let loop2 ((line line))
                 (when (and (string? line) (string-suffix? "\\" line))
                   (loop2 (get-line port))))
               ;; Add native japanese user
               (if (and (equal? ulang "jpn") (equal? level "5"))
                   (loop (cons user jpn) (get-line port))
                   (loop jpn (get-line port))))))))))

;; Read "sentences_base.csv" and return a list of (JPN-ID . TRANS-ID) pairs.
;; JPN and TRANS are vectors of sentence ids sorted in increasing order.  A
;; line contributes a pair when one of its two ids is found in JPN and the
;; other in TRANS, whichever order they appear in.  Lines whose fields are not
;; both numbers are ignored.  The result is in reverse file order.
(define (get-translations jpn trans)
  (define jpn-len (vector-length jpn))
  (define trans-len (vector-length trans))
  ;; Binary search: is ELEM one of the first LEN elements of sorted VECT?
  (define (member-vect elem vect len)
    (let loop ((min 0) (max (- len 1)))
      (if (> min max)
          #f
          (let* ((mid-pos (+ min (quotient (- max min) 2)))
                 (mid-elem (vector-ref vect mid-pos)))
            (cond
              ((= mid-elem elem) #t)
              ((> mid-elem elem) (loop min (- mid-pos 1)))
              (else (loop (+ mid-pos 1) max)))))))
  ;; No pair can exist when either side is empty, so skip reading the file.
  (if (or (zero? jpn-len) (zero? trans-len))
      '()
      (call-with-input-file (tatoeba-file "sentences_base.csv")
        (lambda (port)
          (let loop ((translations '()) (line (get-line port)))
            (if (eof-object? line)
                translations
                (match (string-split line #\tab)
                  ((id translation)
                   (let ((sentence-id (string->number id))
                         (translation-id (string->number translation)))
                     (cond
                       ;; At least one field is not a number: nothing to pair.
                       ((not (and sentence-id translation-id))
                        (loop translations (get-line port)))
                       ((and (member-vect sentence-id trans trans-len)
                             (member-vect translation-id jpn jpn-len))
                        (loop (cons (cons translation-id sentence-id) translations)
                              (get-line port)))
                       ((and (member-vect sentence-id jpn jpn-len)
                             (member-vect translation-id trans trans-len))
                        (loop (cons (cons sentence-id translation-id) translations)
                              (get-line port)))
                       (else
                        (loop translations (get-line port)))))))))))))

;; Build one sentence record per pair in TRANSLATIONS, a list of
;; (JPN-ID . TRANS-ID) pairs.  TRANS and JPN are association lists mapping a
;; sentence id to its text.  Each record is created with
;; (make-sentence JPN-TEXT TRANS-TEXT TAGS #f), where TAGS is the list of tags
;; that "tags.csv" associates with JPN-ID (the empty list when there are none).
(define (add-tags translations trans jpn)
  ;; Convert an association list to a hash table.  Entries are inserted in
  ;; reverse so that the first association of a key wins, as with assoc-ref.
  (define (alist->table alist)
    (let ((table (make-hash-table)))
      (for-each (lambda (entry) (hash-set! table (car entry) (cdr entry)))
                (reverse alist))
      table))
  ;; Hash table from sentence id to its list of tags, read from the whole file.
  (define tags
    (call-with-input-file (tatoeba-file "tags.csv")
      (lambda (port)
        (let ((table (make-hash-table)))
          (let loop ((line (get-line port)))
            (if (eof-object? line)
                table
                (begin
                  (match (string-split line #\tab)
                    ((id tag)
                     (let ((key (string->number id)))
                       (hash-set! table key (cons tag (hash-ref table key '())))))
                    ;; Skip lines that are not exactly "id<TAB>tag".
                    (_ #f))
                  (loop (get-line port)))))))))
  (define trans-table (alist->table trans))
  (define jpn-table (alist->table jpn))
  (map
    (match-lambda
      ((jpn-id . trans-id)
       (make-sentence (hash-ref jpn-table jpn-id)
                      (hash-ref trans-table trans-id)
                      (hash-ref tags jpn-id '())
                      #f)))
    translations))

;; Return the list of sentence records pairing Japanese sentences written by
;; native users with their counterpart in LANG.  Progress counts are printed
;; on the current output port.
(define (get-tatoeba-sentences lang)
  (define native-users (get-native-jpn-users))
  (let-values (((jpn trans) (get-sentences lang native-users)))
    (format #t "jpn: ~a sentences~%" (length jpn))
    (format #t "~a: ~a sentences~%" lang (length trans))
    ;; get-translations does binary searches, hence the sorted vectors of ids.
    (let ((translations
           (get-translations (list->vector (sort (map car jpn) <))
                             (list->vector (sort (map car trans) <)))))
      (format #t "~a pairs~%" (length translations))
      (add-tags translations trans jpn))))

;; Entry point: expects exactly two arguments, the language code and the name
;; of the output file that receives the serialized sentence dictionary.
(match (command-line)
  ((_ lang output)
   (let ((sentences (get-tatoeba-sentences lang)))
     (format #t "Number of entries in ~a: ~a~%" output (length sentences))
     (call-with-output-file output
       (lambda (port)
         (put-bytevector port (serialize-sentence-dictionary sentences))))))
  ((program . _)
   (format (current-error-port) "Usage: ~a LANG OUTPUT~%" program)
   (exit 1)))