nani/website

Julien LepillerSat Jul 09 21:45:24+0200 2022

3138fbd

Add tatoeba

Makefile

 include kanjidic.mk
 include kanjivg.mk
 include radicals.mk
 include tatoeba.mk
 include wadoku.mk
 # Files that constitute the website
 	touch site
 download:
 	@rm -f dictionaries/*
 	@rm -rf dictionaries/*
 	@$(MAKE) $(DOWNLOADS)
 po/%/LC_MESSAGES/nani.mo: po/%.po

manifest.scm

6	6
7	7		; for all
8	8		"gettext"))
	9	+

modules/nani/sentence/sentence.scm unknown status 1

 ;;; Nani Project website
 ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu>
 ;;;
 ;;; This file is part of the Nani Project website.
 ;;;
 ;;; The Nani Project website is free software; you can redistribute it and/or modify it
 ;;; under the terms of the GNU Affero General Public License as published by
 ;;; the Free Software Foundation; either version 3 of the License, or (at
 ;;; your option) any later version.
 ;;;
 ;;; The Nani Project website is distributed in the hope that it will be useful, but
 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ;;; GNU Affero General Public License for more details.
 ;;;
 ;;; You should have received a copy of the GNU Affero General Public License
 ;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
 (define-module (nani sentence sentence)
   #:use-module (ice-9 binary-ports)
   #:use-module (ice-9 match)
   #:use-module (rnrs bytevectors)
   #:use-module (srfi srfi-9)
   #:use-module (nani encoding serialize)
   #:use-module (nani encoding huffman)
   #:use-module (nani encoding trie)
   #:use-module (mecab mecab)
   #:export (make-sentence
             sentence?
             sentence-jpn
             sentence-trans
             sentence-tags
             sentence-audio
             serialize-sentence
             serialize-sentence-dictionary
             sentence-dictionary-entry-count))
 ;; One example sentence together with its translation.
 ;;
 ;; The constructor takes four fields: JPN and TRANS are serialized as
 ;; Huffman-encoded strings, TAGS as a list of such strings, and AUDIO is
 ;; either #f or the name of a file whose content is embedded when
 ;; serializing.
 (define-record-type <sentence>
   (make-sentence jpn trans tags audio)
   sentence?
   ;; Not a constructor argument: recorded by `serialize-sentence' as the
   ;; offset at which the sentence was written, and read back by
   ;; `update-trie-pos!'.
   (position sentence-position sentence-position-set!)
   (jpn      sentence-jpn)
   (trans    sentence-trans)
   (tags     sentence-tags)
   (audio    sentence-audio))
 ;; Write the audio attachment named FILENAME into BV at POS and return the
 ;; position that follows it.  The attachment is stored as a 16-bit length
 ;; followed by the raw content of the file; when FILENAME is #f only a
 ;; zero length is written.
 (define (serialize-audio filename pos bv)
   (if (not filename)
     (serialize-u16 0 pos bv)
     (let* ((len (stat:size (stat filename)))
            (start (serialize-u16 len pos bv))
            (content (call-with-input-file filename get-bytevector-all)))
       (bytevector-copy! content 0 bv start len)
       (+ start len))))
 ;; Return the number of bytes `serialize-audio' needs for FILENAME: the
 ;; length prefix plus the size of the file, or the prefix alone when
 ;; FILENAME is #f.
 (define (audio-size filename)
   (define payload-size
     (if filename
       (stat:size (stat filename))
       0))
   (+ (u16-size payload-size) payload-size))
 ;; Return a serializer for <sentence> records that encodes the Japanese
 ;; text with JPN-HUFFMAN and the translation and tags with TRANS-HUFFMAN.
 ;; The serializer takes (SENTENCE POS BV), writes SENTENCE into BV at POS
 ;; and returns the position following it.  It throws `not-sentence' when
 ;; given anything other than a <sentence>.
 (define (serialize-sentence jpn-huffman trans-huffman)
   (lambda (sentence pos bv)
     (unless (sentence? sentence)
       (throw 'not-sentence sentence))
     ;; Remember where this sentence starts so that the trie can later
     ;; refer to it by offset.
     (sentence-position-set! sentence pos)
     (let* ((after-jpn
              ((serialize-huffman-string jpn-huffman)
               (sentence-jpn sentence) pos bv))
            (after-trans
              ((serialize-huffman-string trans-huffman)
               (sentence-trans sentence) after-jpn bv))
            (after-tags
              ((serialize-list (serialize-huffman-string trans-huffman))
               (sentence-tags sentence) after-trans bv)))
       (serialize-audio (sentence-audio sentence) after-tags bv))))
 ;; Return a procedure computing how many bytes a <sentence> occupies once
 ;; serialized with JPN-HUFFMAN and TRANS-HUFFMAN.  It mirrors the fields
 ;; written by `serialize-sentence' and throws `not-sentence' for any other
 ;; kind of value.
 (define (sentence-size jpn-huffman trans-huffman)
   (lambda (sentence)
     (unless (sentence? sentence)
       (throw 'not-sentence sentence))
     (let* ((jpn-bytes
              ((huffman-string-size jpn-huffman) (sentence-jpn sentence)))
            (trans-bytes
              ((huffman-string-size trans-huffman) (sentence-trans sentence)))
            (tags-bytes
              ((list-size (huffman-string-size trans-huffman))
               (sentence-tags sentence)))
            (audio-bytes (audio-size (sentence-audio sentence))))
       (+ jpn-bytes trans-bytes tags-bytes audio-bytes))))
 ;; Turn the string KEY into a trie key: the list of 4-bit halves (high
 ;; half first) of every byte of its UTF-8 encoding.
 (define (make-key key)
   (let* ((bytes (string->utf8 key))
          (len (bytevector-length bytes)))
     ;; Walk the bytes backwards so the list can be built with `cons' and
     ;; still come out in reading order.
     (let loop ((i (- len 1)) (nibbles '()))
       (if (< i 0)
         nibbles
         (let ((byte (bytevector-u8-ref bytes i)))
           (loop (- i 1)
                 (cons (quotient byte 16)
                       (cons (modulo byte 16) nibbles))))))))
 ;; Replace, in TRIE and all of its children, every value (an index into
 ;; the array SENTENCES) by the recorded position of the sentence found at
 ;; that index.
 (define (update-trie-pos! trie sentences)
   (trie-vals-set!
     trie
     (map (lambda (index)
            (sentence-position (array-ref sentences index)))
          (trie-vals trie)))
   ;; Each transition is a pair whose cdr is the child node.
   (for-each
     (lambda (transition)
       (update-trie-pos! (cdr transition) sentences))
     (trie-transitions trie)))
 ;; Serialize SENTENCES, a list of <sentence> records, and return the
 ;; resulting bytevector.  The content is laid out in this order:
 ;;
 ;;   - the 16-byte header "NANI_SENTENCE001";
 ;;   - a 4-byte big-endian offset of the trie;
 ;;   - the serialized Huffman trees (Japanese, then translation);
 ;;   - the sentences themselves;
 ;;   - the number of sentences, in the 4 bytes right before the trie
 ;;     (this is where `sentence-dictionary-entry-count' reads it);
 ;;   - a trie mapping every word MeCab finds in a Japanese sentence to the
 ;;     positions of the sentences containing it.
 ;;
 ;; Some statistics are printed on the current output port.
 (define (serialize-sentence-dictionary sentences)
   (define jpn-huffman
     (let ((jpn (map sentence-jpn sentences)))
       (create-huffman jpn)))
   (define jpn-huffman-code (huffman->code jpn-huffman))
   ;; Translations and tags share a single Huffman tree.
   (define trans-huffman
     (let ((trans (map sentence-trans sentences))
           (tags (apply append (map sentence-tags sentences))))
       (create-huffman (append trans tags))))
   (define trans-huffman-code (huffman->code trans-huffman))
   ;; Build the trie that maps each word to the indices (in SENTENCES) of
   ;; the sentences that contain it.
   (define (make-sentence-trie sentences)
     (let ((trie (make-empty-trie))
           (tagger (mecab-new-tagger '())))
       (let loop ((sentences sentences) (i 0))
         (if (null? sentences)
           (begin
             (mecab-destroy tagger)
             (compress-trie trie))
           (begin
             (for-each
               (lambda (key)
                 (add-to-trie! trie (make-key key) i))
               (mecab-words tagger (sentence-jpn (car sentences))))
             (loop (cdr sentences) (+ i 1)))))))
   ;; Number of nodes in TRIE, itself included.
   (define (trie-node-size trie)
     (apply + 1 (map trie-node-size (map cdr (trie-transitions trie)))))
   (let* ((header (string->utf8 "NANI_SENTENCE001"))
          (header-size (bytevector-length header))
          (pointers (make-bytevector 4 0))
          (jpn-huffman-bv (serialize-huffman jpn-huffman))
          (jpn-huffman-size (bytevector-length jpn-huffman-bv))
          (trans-huffman-bv (serialize-huffman trans-huffman))
          (trans-huffman-size (bytevector-length trans-huffman-bv))
          ;; These two deliberately shadow the imported procedures with
          ;; their specialization for integer values.
          (serialize-trie (serialize-trie serialize-int int-size))
          (trie-size (trie-size int-size))
          (sentence-trie (make-sentence-trie sentences))
          (sentence-trie-size (trie-size sentence-trie))
          (sentences-size
            ((list-size (sentence-size jpn-huffman-code trans-huffman-code)
                        #:size? #f)
             sentences))
          (huffman-size (+ jpn-huffman-size trans-huffman-size))
          ;; Offsets of the successive sections.  The entry count takes the
          ;; 4 bytes between the sentences and the trie.
          (pos-sentences (+ header-size 4 huffman-size))
          (pos-count (+ pos-sentences sentences-size))
          (pos-trie (+ pos-count 4))
          (bv (make-bytevector (+ pos-trie sentence-trie-size))))
     (format #t "Number of nodes in trie: ~a~%" (trie-node-size sentence-trie))
     ((serialize-list (serialize-sentence jpn-huffman-code trans-huffman-code)
                      #:size? #f)
      sentences pos-sentences bv)
     ;; Serializing the sentences recorded the position of each of them;
     ;; use it to turn the indices stored in the trie into positions.
     (update-trie-pos! sentence-trie (list->array 1 sentences))
     ;; number of entries
     (serialize-int (length sentences) pos-count bv)
     (serialize-trie sentence-trie pos-trie bv)
     ;; point to the trie structure: store the very offset it was written
     ;; at, so that pointer and content cannot disagree
     (bytevector-u32-set! pointers 0 pos-trie (endianness big))
     ;; copy to result bytevector
     (bytevector-copy! header 0 bv 0 header-size)
     (bytevector-copy! pointers 0 bv header-size 4)
     (bytevector-copy! jpn-huffman-bv 0 bv (+ header-size 4) jpn-huffman-size)
     (bytevector-copy! trans-huffman-bv 0 bv (+ header-size 4 jpn-huffman-size)
                       trans-huffman-size)
     ;; give some feedback on the size of the file's structures
     (format #t "huffmans are ~a bytes long~%" huffman-size)
     (format #t "sentences are ~a bytes long~%" sentences-size)
     (format #t "trie is ~a bytes long~%" sentence-trie-size)
     bv))
 ;; Return the number of entries stored in the sentence dictionary FILE.
 ;; The count is the big-endian 32-bit integer found in the 4 bytes that
 ;; precede the offset stored right after the 16-byte header.
 (define (sentence-dictionary-entry-count file)
   (define (read-u32 port)
     (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))
   (call-with-input-file file
     (lambda (port)
       ;; The header is decoded but not checked; reading it moves the port
       ;; to the offset field.
       (utf8->string (get-bytevector-n port 16))
       (let ((end-offset (read-u32 port)))
         (seek port (- end-offset 4) SEEK_SET)
         (read-u32 port)))))

po/nani.pot

 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\n"
 "Report-Msgid-Bugs-To: \n"
 "POT-Creation-Date: 2021-07-29 21:50+0200\n"
 "POT-Creation-Date: 2022-07-09 21:44+0200\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"
 msgid "JMdict"
 msgstr ""
 #: tools/list.scm:71
 #: tools/list.scm:85
 msgid ""
 "Japanese/Dutch dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:70
 #: tools/list.scm:84
 msgid ""
 "Japanese/English dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:72
 msgid "Japanese/French aligned sentences from the Tatoeba project."
 msgstr ""
 #: tools/list.scm:86
 msgid ""
 "Japanese/French dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:61
 #: tools/list.scm:62
 msgid "Japanese/French dictionary from the Jibiki project."
 msgstr ""
 #: tools/list.scm:45
 #: tools/list.scm:46
 msgid "Japanese/German dictionary from Wadoku."
 msgstr ""
 #: tools/list.scm:73
 #: tools/list.scm:87
 msgid ""
 "Japanese/German dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:74
 #: tools/list.scm:88
 msgid ""
 "Japanese/Hungarian dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:75
 #: tools/list.scm:73
 msgid "Japanese/Russian aligned sentences from the Tatoeba project."
 msgstr ""
 #: tools/list.scm:89
 msgid ""
 "Japanese/Russian dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:76
 #: tools/list.scm:90
 msgid ""
 "Japanese/Slovenian dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:77
 #: tools/list.scm:74
 msgid "Japanese/Spanish aligned sentences from the Tatoeba project."
 msgstr ""
 #: tools/list.scm:91
 msgid ""
 "Japanese/Spanish dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:78
 #: tools/list.scm:92
 msgid ""
 "Japanese/Swedish dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgstr ""
 #: tools/list.scm:87
 #: tools/list.scm:75
 msgid "Japanese/Ukrainian aligned sentences from the Tatoeba project."
 msgstr ""
 #: tools/list.scm:71
 msgid "Japanese/english aligned sentences from the Tatoeba project."
 msgstr ""
 #: tools/list.scm:101
 msgid "Kanji dictionary with English meanings."
 msgstr ""
 #: tools/list.scm:89
 #: tools/list.scm:103
 msgid "Kanji dictionary with French meanings."
 msgstr ""
 #: tools/list.scm:90
 #: tools/list.scm:104
 msgid "Kanji dictionary with Portuguese meanings."
 msgstr ""
 #: tools/list.scm:88
 #: tools/list.scm:102
 msgid "Kanji dictionary with Spanish meanings."
 msgstr ""
 #: tools/list.scm:39
 #: tools/list.scm:40
 msgid "Kanji writing visual help by the Kanjivg project."
 msgstr ""
 msgid "Phone: "
 msgstr ""
 #: tools/list.scm:53
 #: tools/list.scm:54
 msgid "Pitch accent dictionary from Wadoku."
 msgstr ""
 #: tools/list.scm:32
 #: tools/list.scm:33
 msgid ""
 "Radical to Kanji dictionary from the Electronic Dictionary Research and "
 "Development Group."
 msgid "Tap on the search button and you'll see the results. Easy, right?"
 msgstr ""
 #: tools/list.scm:77
 msgid ""
 "Tatoeba is a collection of sentences and translations. This\n"
 "        dictionary contains pairs of sentences that are direct translations "
 "of\n"
 "        one another, which allows you to see example sentences in search\n"
 "        results."
 msgstr ""
 #: pages/e404.scm:27
 msgid "That's a 404 :/"
 msgstr ""
 "In the following sections we will see how to use them."
 msgstr ""
 #: tools/list.scm:55
 #: tools/list.scm:56
 msgid ""
 "This dictionary allows you to augment search results on the main view\n"
 "         with pitch accent (pronunciation) information.  Japanese is not "
 "         words better, with a standard Japanese pitch accent."
 msgstr ""
 #: tools/list.scm:63
 #: tools/list.scm:64
 msgid ""
 "This dictionary allows you to do searches on the main view of this app.\n"
 "\tFailing to download one of these dictionaries will make the app unusable\n"
 "\tby kanji, reading (kana) and by French translation."
 msgstr ""
 #: tools/list.scm:47
 #: tools/list.scm:48
 msgid ""
 "This dictionary allows you to do searches on the main view of this app.\n"
 "        Failing to download one of these dictionaries will make the app "
 "        by kanji, reading (kana) and by German translation."
 msgstr ""
 #: tools/list.scm:80
 #: tools/list.scm:94
 msgid ""
 "This dictionary allows you to do searches on the main view of this app.\n"
 "        Failing to download one of these dictionaries will make the app "
 "        kanji, reading (kana) and by meaning in the languages you selected."
 msgstr ""
 #: tools/list.scm:34
 #: tools/list.scm:35
 msgid ""
 "This dictionary allows you to enter kanji by selecting some of its\n"
 "    components.  Tap the water component button on the bottom of the screen "
 "    access the kanji selection by component view"
 msgstr ""
 #: tools/list.scm:92
 #: tools/list.scm:106
 msgid ""
 "This dictionary allows you to search for kanji and view kanji information\n"
 "        such as number of strokes, pronunciations and meanings."
 msgstr ""
 #: tools/list.scm:41
 #: tools/list.scm:42
 msgid ""
 "This dictionary allows you to see how a kanji is written, what it is\n"
 "composed of, and the order in which strokes are written."

tatoeba.mk unknown status 1

 # Tatoeba example sentences: one dictionary per translation language.
 TATOEBA_LANGS=eng fra rus spa ukr
 DICOS+=$(addprefix dicos/tatoeba_, $(addsuffix .nani, $(TATOEBA_LANGS)))
 TATOEBA_DOWNLOADS+=$(addprefix dictionaries/tatoeba_, $(addsuffix .csv, sentences_detailed sentences_base tags user_languages))
 DOWNLOADS+=$(TATOEBA_DOWNLOADS)
 
 # Keep the downloaded exports even when make only built them as
 # intermediate files.
 .PRECIOUS: dictionaries/tatoeba%.csv
 
 # Download and unpack one export.  `cut -c9-' drops the 8-character
 # "tatoeba_" prefix to recover the name used upstream.
 dictionaries/tatoeba%.csv:
 	wget https://downloads.tatoeba.org/exports/$$(basename $@ .csv | cut -c9-).tar.bz2 -O $@.tar.bz2 --continue
 	tar xf $@.tar.bz2 -C dictionaries
 	mv dictionaries/$$(basename $@ | cut -c9-) $@
 
 # Build the dictionary of one language; the stem ($*) is the language code.
 # NOTE(review): TATOEBA_MODULES is not defined in this file -- confirm it is
 # set elsewhere, otherwise changes to the Scheme sources do not trigger a
 # rebuild.
 dicos/tatoeba_%.nani: $(TATOEBA_DOWNLOADS) $(TATOEBA_MODULES) $(RADK_MODULES)
 	guile -L modules tools/tatoeba.scm $* $@

tools/list.scm

 (use-modules (nani kanji kanjivg))
 (use-modules (nani result result))
 (use-modules (nani pitch pitch))
 (use-modules (nani sentence sentence))
 (use-modules (gcrypt hash))
 (use-modules (ice-9 match))
 (use-modules (ice-9 format))
 	as you can't search for anything.  This dictionary can be searched for
 	by kanji, reading (kana) and by French translation."))
   ;; Return the short, translatable description of the Tatoeba dictionary
   ;; for the language code LANG.  Only the five codes listed below are
   ;; handled; `match' has no fallback clause for any other value.
   (define (tatoeba-synopsis lang)
     (match lang
       ("eng" `(_ "Japanese/english aligned sentences from the Tatoeba project."))
       ("fra" `(_ "Japanese/French aligned sentences from the Tatoeba project."))
       ("rus" `(_ "Japanese/Russian aligned sentences from the Tatoeba project."))
       ("spa" `(_ "Japanese/Spanish aligned sentences from the Tatoeba project."))
       ("ukr" `(_ "Japanese/Ukrainian aligned sentences from the Tatoeba project."))))
   ;; Return the long, translatable description shared by all Tatoeba
   ;; dictionaries.  LANG is accepted for symmetry with the synopsis
   ;; procedure but is not used.
   (define (tatoeba-description lang)
     `(_ "Tatoeba is a collection of sentences and translations. This
         dictionary contains pairs of sentences that are direct translations of
         one another, which allows you to see example sentences in search
         results."))
   (define (jmdict-synopsis lang)
     (match lang
       ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group."))
              (let ((dico-lang (substring dico 7)))
                (if long?
                    (jmdict-description dico-lang)
                    (jmdict-synopsis dico-lang))))))
                    (jmdict-synopsis dico-lang))))
             ((equal? (dico-type dico) "tatoeba")
              (let ((dico-lang (substring dico 8)))
                (if long?
                    (tatoeba-description dico-lang)
                    (tatoeba-synopsis dico-lang))))))
          (translated (translate english lang)))
     (if (and (equal? english translated) (not (equal? lang "en")))
         #f
      "kanjidic")
     ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict"))
      "jmdict")
     ((and (> (string-length file) 7) (equal? (substring file 0 7) "tatoeba"))
      "tatoeba")
     ((equal? file "jibiki_fre") "jibiki")
     ((equal? file "wadoku_ger") "wadoku")
     ((equal? file "wadoku_pitch") "wadoku_pitch")))
      (kanjidic-entry-count file))
     ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki"))
      (dictionary-entry-count file))
     ((equal? (dico-type (dico-name file)) "tatoeba")
      (sentence-dictionary-entry-count file))
     ((equal? (dico-type (dico-name file)) "wadoku_pitch")
      (pitch-entry-count file))))

tools/tatoeba.scm unknown status 1

 ;;; Nani Project website
 ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu>
 ;;;
 ;;; This file is part of the Nani Project website.
 ;;;
 ;;; The Nani Project website is free software; you can redistribute it and/or modify it
 ;;; under the terms of the GNU Affero General Public License as published by
 ;;; the Free Software Foundation; either version 3 of the License, or (at
 ;;; your option) any later version.
 ;;;
 ;;; The Nani Project website is distributed in the hope that it will be useful, but
 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ;;; GNU Affero General Public License for more details.
 ;;;
 ;;; You should have received a copy of the GNU Affero General Public License
 ;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
 (use-modules (srfi srfi-9))
 (use-modules (srfi srfi-11))
 (use-modules (ice-9 match))
 (use-modules (ice-9 binary-ports))
 (use-modules (ice-9 textual-ports))
 (use-modules (nani sentence sentence))
 ;; Return the path of the Tatoeba export FILE, which is looked up in the
 ;; "dictionaries" directory next to the directory containing this script.
 (define (tatoeba-file file)
   (let ((script-dir (dirname (current-filename))))
     (string-append script-dir "/../dictionaries/tatoeba_" file)))
 ;; Read sentences_detailed.csv and return two values, both lists of
 ;; (ID . TEXT) pairs in file order: the Japanese sentences written by one
 ;; of NATIVE-USERS, and the sentences whose language is LANG.  A sentence
 ;; whose language is LANG always goes to the second list.
 (define (get-sentences lang native-users)
   (define (entry id text)
     (cons (string->number id) text))
   (call-with-input-file (tatoeba-file "sentences_detailed.csv")
     (lambda (port)
       (let loop ((line (get-line port)) (jpn-acc '()) (trans-acc '()))
         (if (eof-object? line)
           (values (reverse jpn-acc) (reverse trans-acc))
           (match (string-split line #\tab)
             ;; Six tab-separated fields; the last two are ignored.
             ((id slang text user _ _)
              (cond
                ((equal? lang slang)
                 (loop (get-line port)
                       jpn-acc
                       (cons (entry id text) trans-acc)))
                ((and (equal? slang "jpn") (member user native-users))
                 (loop (get-line port)
                       (cons (entry id text) jpn-acc)
                       trans-acc))
                (else
                 (loop (get-line port) jpn-acc trans-acc))))))))))
 ;; Read user_languages.csv and return the list of users who declared
 ;; Japanese ("jpn") with level "5".  Each record has four tab-separated
 ;; fields; the last one may continue over several lines, every line but
 ;; the last ending with a backslash.
 (define (get-native-jpn-users)
   (call-with-input-file (tatoeba-file "user_languages.csv")
     (lambda (port)
       (let loop ((jpn '()) (line (get-line port)))
         (if (eof-object? line)
           jpn
           (match (string-split line #\tab)
             ((ulang level user _)
              ;; Consume the comment if it's on multiple lines.  Stop at
              ;; end of file too, so that a file ending in the middle of a
              ;; comment does not pass the EOF object to `string-suffix?'.
              (let loop2 ((line line))
                (when (and (string? line) (string-suffix? "\\" line))
                  (loop2 (get-line port))))
              ;; Add native japanese user
              (if (and (equal? ulang "jpn") (equal? level "5"))
                (loop (cons user jpn) (get-line port))
                (loop jpn (get-line port))))))))))
 ;; Read sentences_base.csv and return a list of (JPN-ID . TRANS-ID) pairs,
 ;; one for every line that links an id found in JPN with an id found in
 ;; TRANS, in either order.  JPN and TRANS are vectors of sentence ids
 ;; sorted in increasing order.  Fields that are not numbers never match.
 (define (get-translations jpn trans)
   (define jpn-len (vector-length jpn))
   (define trans-len (vector-length trans))
   ;; Binary search for ELEM in the first LEN elements of the sorted VECT.
   (define (member-vect elem vect len)
     (let loop ((low 0) (high (- len 1)))
       (if (> low high)
         #f
         (let* ((mid (quotient (+ low high) 2))
                (mid-elem (vector-ref vect mid)))
           (cond
             ((= mid-elem elem) #t)
             ((> mid-elem elem) (loop low (- mid 1)))
             (else (loop (+ mid 1) high)))))))
   (define (jpn-id? id)
     (and id (member-vect id jpn jpn-len)))
   (define (trans-id? id)
     (and id (member-vect id trans trans-len)))
   (call-with-input-file (tatoeba-file "sentences_base.csv")
     (lambda (port)
       (let loop ((translations '()) (line (get-line port)))
         ;; Nothing can match when either vector is empty, so there is no
         ;; point in reading the file.
         (if (or (eof-object? line) (zero? jpn-len) (zero? trans-len))
           translations
           (match (string-split line #\tab)
             ((id-field base-field)
              (let ((id (string->number id-field))
                    (base (string->number base-field)))
                (cond
                  ;; Translation on the left, Japanese sentence on the right.
                  ((and (trans-id? id) (jpn-id? base))
                   (loop (cons (cons base id) translations)
                         (get-line port)))
                  ;; Japanese sentence on the left, translation on the right.
                  ((and (jpn-id? id) (trans-id? base))
                   (loop (cons (cons id base) translations)
                         (get-line port)))
                  (else
                   (loop translations (get-line port))))))))))))
 ;; Turn TRANSLATIONS, a list of (JPN-ID . TRANS-ID) pairs, into a list of
 ;; <sentence> records.  TRANS and JPN are association lists from sentence
 ;; id to text.  Each record gets the tags that tags.csv associates with
 ;; the Japanese sentence, and no audio.
 (define (add-tags translations trans jpn)
   ;; Hash table from sentence id to its list of tags, the tag read last
   ;; coming first.  Every line of tags.csv is read.
   (define tags
     (call-with-input-file (tatoeba-file "tags.csv")
       (lambda (port)
         (let ((table (make-hash-table)))
           (let loop ((line (get-line port)))
             (if (eof-object? line)
               table
               (match (string-split line #\tab)
                 ((id tag)
                  (let ((key (string->number id)))
                    (hash-set! table key
                               (cons tag (hash-ref table key '())))
                    (loop (get-line port)))))))))))
   (map
     (match-lambda
       ((jpn-id . trans-id)
        (let ((sentence-tags (hash-ref tags jpn-id '()))
              (trans-text (assoc-ref trans trans-id))
              (jpn-text (assoc-ref jpn jpn-id)))
          (make-sentence jpn-text trans-text sentence-tags #f))))
     translations))
 ;; Return the list of <sentence> records pairing Japanese sentences from
 ;; native speakers with their translation in LANG.  Counts are printed on
 ;; the current output port along the way.
 (define (get-tatoeba-sentences lang)
   ;; Sorted vector of the ids of ENTRIES, a list of (ID . TEXT) pairs.
   (define (sorted-ids entries)
     (list->vector (sort (map car entries) <)))
   (define native-users (get-native-jpn-users))
   (call-with-values
     (lambda () (get-sentences lang native-users))
     (lambda (jpn trans)
       (format #t "jpn: ~a sentences~%" (length jpn))
       (format #t "~a: ~a sentences~%" lang (length trans))
       (let ((translations
               (get-translations (sorted-ids jpn) (sorted-ids trans))))
         (format #t "~a pairs~%" (length translations))
         (add-tags translations trans jpn)))))
 ;; Entry point: expects the language code and the output file name on the
 ;; command line, builds the dictionary and writes it to the output file.
 (match (command-line)
   ((_ lang output)
    (let* ((sentences (get-tatoeba-sentences lang))
           (entry-count (length sentences)))
      (format #t "Number of entries in ~a: ~a~%" output entry-count)
      (call-with-output-file output
        (lambda (port)
          (let ((dictionary (serialize-sentence-dictionary sentences)))
            (put-bytevector port dictionary)))))))