Add wadoku dictionary modules

Julien LepillerWed Jun 03 20:44:26+0200 2020

96f835b

Add wadoku dictionary modules

modules/nani/parse-result.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani parse-result)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (rnrs bytevectors)
22+
  #:export (parse-result-list
23+
            parse-result-char
24+
            parse-result-int
25+
            parse-result-boolean
26+
            parse-result-string))
27+
28+
(define* (parse-result-list port parse-result-element)
29+
  (let ((size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big))))
30+
    (let loop ((result '()) (remaining size))
31+
      (if (= remaining 0)
32+
          (reverse result)
33+
          (loop (cons (parse-result-element port) result) (- remaining 1))))))
34+
35+
(define (parse-result-char port)
36+
  (get-u8 port))
37+
38+
(define (parse-result-int port)
39+
  (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))
40+
41+
(define (parse-result-boolean port)
42+
  (= (get-u8 port) 1))
43+
44+
(define (parse-result-string port)
45+
  (define (get-result-string port)
46+
    (let loop ((lu8 '()) (char (get-u8 port)))
47+
      (if (= char 0)
48+
          lu8
49+
          (loop (cons char lu8) (get-u8 port)))))
50+
  (utf8->string (u8-list->bytevector (reverse (get-result-string port)))))

modules/nani/wadoku/entities.scm unknown status 1

1+
(define-module (nani wadoku entities)
2+
  #:use-module (nani result)
3+
  #:use-module ((nani tags) #:prefix tag:)
4+
  #:use-module (ice-9 match)
5+
  #:export (get-tag hint->tag)
6+
  #:re-export (tag:get-points))
7+
8+
(define wadoku-entities
9+
  `(("Aerodyn." . "aero")
10+
    ("Agrochem." . "chem")
11+
    ("??gypt. Gesch." . "hist")
12+
    ("??gypt. Mythol." . "rel")
13+
    ("Aktien" . "econ")
14+
    ("Akupunktur" . "med")
15+
    ("Akustik" . "music")
16+
    ("American Football" . "sports")
17+
    ("amerik. Gesch." . "hist")
18+
    ("amerik. Pol." . "politics")
19+
    ("amerik. Rechtsw." . "law")
20+
    ("Anat." . "anat")
21+
    ("Angeln" . "sports")
22+
    ("Anime" . "work")
23+
    ("Anthropol." . "antho")
24+
    ("arab. Gesch." . "hist")
25+
    ("Arch??ol." . "archeo")
26+
    ("Archit." . "archit")
27+
    ("Astrol." . "astron")
28+
    ("Astron." . "astron")
29+
    ("atmosph??r. Optik" . "physics")
30+
    ("Atomphys." . "physics")
31+
    ("Audio" . "music")
32+
    ("Augenheilk." . "med")
33+
    ("Autorennen" . "sports")
34+
    ("Backw." . "food")
35+
    ("Badminton" . "sports")
36+
    ("Bahnhofsn." . "station")
37+
    ("Bakteriol." . "biol")
38+
    ("Ballet" . "music")
39+
    ("Ballett" . "music")
40+
    ("Ballsport" . "sports")
41+
    ("Bankw." . "finc")
42+
    ("Baseb." . "baseb")
43+
    ("Baseball" . "baseb")
44+
    ("Basketball" . "sports")
45+
    ("Bauingenieurw." . "archit")
46+
    ("Bautechnik" . "archit")
47+
    ("Bauw." . "archit")
48+
    ("Befestigungsw." . "archit")
49+
    ("Beh??rde" . "law")
50+
    ("Bergb." . "archit")
51+
    ("Bergbau" . "archit")
52+
    ("Bergn." . "place")
53+
    ("Bergsteigen" . "sports")
54+
    ("Bibel" . "Christn")
55+
    ("Bibliotheksw." . "biblio")
56+
    ("Bildhauerei" . "work")
57+
    ("bild. Kunst" . "work")
58+
    ("Billard" . "game")
59+
    ("Biochem." . "biol")
60+
    ("Biol." . "biol")
61+
    ("Bodenkunde" . "agri")
62+
    ("Bogenschie??en" . "sports")
63+
    ("B??rse" . "econ")
64+
    ("Bot." . "bot")
65+
    ("Bowling" . "sports")
66+
    ("Boxen" . "sports")
67+
    ("Brauereiw." . "alco")
68+
    ("Brettspiel" . "game")
69+
    ("brit. Gesch." . "hist")
70+
    ("brit. Rechtsw." . "law")
71+
    ("Br??ckenbau" . "engr")
72+
    ("Bsp." . "exp")
73+
    ("Buchbinden" . "print")
74+
    ("B??chereiw." . "biblio")
75+
    ("Buchtitel" . "work")
76+
    ("Buchw." . "biblio")
77+
    ("Buddh." . "Buddh")
78+
    ("Bud??" . "MA")
79+
    ("B??d??" . "MA")
80+
    ("Chem." . "chem")
81+
    ("chin. Arch??ol." . "archeo")
82+
    ("chin. Astron." . "astron")
83+
    ("chin. Gesch." . "hist")
84+
    ("chin. Gramm." . "ling")
85+
    ("chin. Literaturw." . "work")
86+
    ("chin. Mus." . "music")
87+
    ("chin. Musikinstr." . "music")
88+
    ("chin. Mythol." . "rel")
89+
    ("chin. Philos." . "rel")
90+
    ("chin. Phon." . "ling")
91+
    ("chin. Poet." . "poet")
92+
    ("chin. Pol." . "politics")
93+
    ("chin. Theater" . "work")
94+
    ("Chirurgie" . "med")
95+
    ("Christent." . "Christn")
96+
    ("Cocktail" . "alco")
97+
    ("Computerling." . "comp")
98+
    ("Curling" . "sports")
99+
    ("Dammbau" . "engr")
100+
    ("Demogr." . "demo")
101+
    ("Diplomat." . "politics")
102+
    ("Druckw." . "print")
103+
    ("dtsch. Gesch." . "hist")
104+
    ("dtsch. Gramm." . "ling")
105+
    ("EDV" . "comp")
106+
    ("Einzel-Kanji" . "ling")
107+
    ("Streckenkilometertafel" . "train")
108+
    ("Eishockey" . "sports")
109+
    ("Eiskunstl." . "sports")
110+
    ("Eiskunstlauf" . "sports")
111+
    ("Eisschnelllauf" . "sports")
112+
    ("Elektrochem." . "chem")
113+
    ("Elektrot." . "engr")
114+
    ("Embryol." . "biol")
115+
    ("engl. Gesch." . "hist")
116+
    ("Entwicklungsphysiol." . "biol")
117+
    ("Entwicklungspsychol." . "psy")
118+
    ("Ergonomie" . "engr")
119+
    ("Ethik" . "philo")
120+
    ("Ethnol." . "ethno")
121+
    ("europ. Gesch." . "hist")
122+
    ("Fahrrad" . "sports")
123+
    ("Familienn." . "surname")
124+
    ("Familienn.." . "surname")
125+
    ("Farbe" . "color")
126+
    ("F??rberei" . "build")
127+
    ("Fechten" . "build")
128+
    ("Fertigungstechnik" . "engr")
129+
    ("Film" . "work")
130+
    ("Filmtitel" . "work")
131+
    ("Finanzw." . "finc")
132+
    ("Firmenn." . "company")
133+
    ("Fischerei" . "sports")
134+
    ("Fischk." . "food")
135+
    ("Flugw." . "engr")
136+
    ("Flussn." . "place")
137+
    ("Forstw." . "bot")
138+
    ("Fotog." . "photo")
139+
    ("Frachtw." . "trans")
140+
    ("franz. Gesch." . "hist")
141+
    ("Frisur" . "fashion")
142+
    ("Funkt." . "physics")
143+
    ("Fu??ball" . "sports")
144+
145+
    ("Gagaku" . "music")
146+
    ("Garten" . "bota")
147+
    ("Gartenk." . "bota")
148+
    ("Gateball" . "sports")
149+
    ("Gebietsn." . "place")
150+
    ("Geldw." . "finc")
151+
    ("Genetik" . "biol")
152+
    ("Geogr." . "place")
153+
    ("Geol." . "geol")
154+
    ("Geom." . "geom")
155+
    ("Geophys." . "physics")
156+
    ("Gerberei" . "build")
157+
    ("Gesch." . "hist")
158+
    ("Gewichtheben" . "sports")
159+
    ("Glasherst." . "build")
160+
    ("Glaziol." . "sci")
161+
    ("Go" . "game")
162+
    ("Golf" . "sports")
163+
    ("Gramm." . "ling")
164+
    ("griech. Christent." . "Christn")
165+
    ("griech. Gesch." . "hist")
166+
    ("griech. Myth." . "rel")
167+
    ("griech. Mythol." . "rel")
168+
    ("griech. Theat." . "work")
169+
    ("Gymnastik" . "sports")
170+
    ("Handball" . "sports")
171+
    ("Handel" . "econ")
172+
    ("Handwerkskunst" . "build")
173+
    ("Heraldik" . "sci")
174+
    ("Hindu." . "rel")
175+
    ("Hochsprung" . "sports")
176+
    ("Hockey" . "sports")
177+
    ("Holzbau" . "engr")
178+
    ("Holzbearb." . "engr")
179+
    ("Holzschnitt" . "engr")
180+
    ("Hunderasse" . "zool")
181+
    ("Hydrodynamik" . "physics")
182+
    ("I Ging" . "rel")
183+
    ("Ikebana" . "build")
184+
    ("Immunol." . "med")
185+
    ("ind. Gesch." . "hist")
186+
    ("indian. Mythol." . "rel")
187+
    ("ind. Mythol." . "rel")
188+
    ("indones. Gesch." . "hist")
189+
    ("ind. Philos." . "philo")
190+
    ("Insektenk." . "zool")
191+
    ("Inseln." . "place")
192+
    ("Internet" . "comp")
193+
    ("Islam" . "rel")
194+
    ("islam. Mythol." . "rel")
195+
    ("islam. Rechtsw." . "law")
196+
    ("islam. Rel." . "rel")
197+
    ("israel. Gesch." . "hist")
198+
    ("ital. Gesch." . "hist")
199+
    ("japan. Archit." . "archit")
200+
    ("japan. Astrologie" . "aston")
201+
    ("japan. bild. Kunst" . "work")
202+
    ("japan. Christent." . "Christn")
203+
    ("japan. Gesch." . "hist")
204+
    ("japan. Gramm." . "ling")
205+
    ("japan. Literaturw." . "work")
206+
    ("japan. Med." . "med")
207+
    ("japan. Mus." . "music")
208+
    ("japan. Mythol." . "rel")
209+
    ("japan. Philos." . "philo")
210+
    ("japan. Pol." . "politics")
211+
    ("japan. Schwimmen" . "sports")
212+
    ("japan. Soziol." . "sci")
213+
    ("japan. Sprachw." . "ling")
214+
    ("J??ruri" . "music")
215+
    ("Judent." . "rel")
216+
    ("J??d??" . "MA")
217+
    ("Kabuki" . "music")
218+
    ("Kagura" . "music")
219+
    ("Kalligraphie" . "build")
220+
    ("Kanbun" . "ling")
221+
    ("Kanji" . "ling")
222+
    ("Kanp??" . "med")
223+
    ("Karten" . "games")
224+
    ("Kartenspiel" . "game")
225+
    ("Kartogr." . "place")
226+
    ("kath. Christent." . "Christn")
227+
    ("Kend??" . "MA")
228+
    ("Keramik" . "build")
229+
    ("Kernphys." . "physics")
230+
    ("Kfz-W." . "engr")
231+
    ("KI" . "comp")
232+
    ("Kleidung" . "fashion")
233+
    ("Kochk." . "food")
234+
    ("Konditoreiw." . "food")
235+
    ("Konfuz." . "rel")
236+
    ("korean. Gesch." . "hist")
237+
    ("korean. Rel." . "rel")
238+
    ("korean. Theater" . "work")
239+
    ("Kosmetik" . "fashion")
240+
    ("Kricket" . "sports")
241+
    ("Kristallogr." . "physics")
242+
    ("Krocket" . "sports")
243+
    ("Kunst" . "work")
244+
    ("Kunstgesch." . "hist")
245+
    ("Kunst??? und Turmsprung" . "build")
246+
    ("Ky??gen" . "music")
247+
    ("Lack" . "work")
248+
    ("L??ndern." . "place")
249+
    ("Landw." . "agri")
250+
    ("Leichtathl." . "sports")
251+
    ("Liedtitel" . "work")
252+
    ("Literaturw." . "work")
253+
    ("Logik" . "math")
254+
    ("Luftf." . "aero")
255+
    ("Machinenb." . "engr")
256+
    ("Mah-Jongg" . "mahj")
257+
    ("Management" . "bus")
258+
    ("Manga" . "work")
259+
    ("m??nnl. Name" . "given")
260+
    ("m??nnl. Vorn." . "given")
261+
    ("m??nnl. Vorname" . "given")
262+
    ("Mantik" . "rel")
263+
    ("Marine" . "sail")
264+
    ("Marketing" . "bus")
265+
    ("Maschinenb." . "engr")
266+
    ("Ma??" . "engr")
267+
    ("Math." . "math")
268+
    ("Mech." . "physics")
269+
    ("Med." . "med")
270+
    ("Meeresk." . "place")
271+
    ("Meeresn." . "place")
272+
    ("Meinungsforschung" . "politics")
273+
    ("Messw." . "hobby")
274+
    ("Metall." . "engr")
275+
    ("Metallbearb." . "engr")
276+
    ("Metallguss" . "engr")
277+
    ("Meteor." . "meteor")
278+
    ("mexikan. Kochk." . "food")
279+
    ("Mikrobiol." . "biol")
280+
    ("Milit." . "mil")
281+
    ("Mineral." . "geol")
282+
    ("M??belbau" . "build")
283+
    ("Mode" . "fashion")
284+
    ("Motorsport" . "sports")
285+
    ("Mus." . "music")
286+
    ("Muschelk." . "zool")
287+
    ("Musikinstr." . "music")
288+
    ("Mykol." . "bota")
289+
    ("Mythol." . "rel")
290+
    ("Nachrichtent." . "politics")
291+
    ("N??hen" . "fashion")
292+
    ("Naniwabushi" . "music")
293+
    ("Naturmed." . "med")
294+
    ("Naturph??n." . "physics")
295+
    ("Naturphilos." . "philo")
296+
    ("Neng??" . "hist")
297+
    ("Netzwerktechnik" . "comp")
298+
    ("Neurol." . "med")
299+
    ("N??" . "music")
300+
    ("nord. Mythol." . "rel")
301+
    ("Numismatik" . "build")
302+
    ("??kol." . "biol")
303+
    ("Oper" . "music")
304+
    ("Optik" . "physics")
305+
    ("Org." . "organization")
306+
    ("Origami" . "build")
307+
    ("Ortsn." . "place")
308+
    ("P??d." . "school")
309+
    ("Pal??ontol." . "hist")
310+
    ("Papierherst." . "build")
311+
    ("Parapsych." . "psy")
312+
    ("Patentw." . "engr")
313+
    ("Pathol." . "med")
314+
    ("pers. Gesch." . "hist")
315+
    ("Pers??nlichk." . "person")
316+
    ("Petrochem." . "chem")
317+
    ("Pferderennen" . "sports")
318+
    ("Pferdesport" . "sports")
319+
    ("Pharm." . "med")
320+
    ("Philos." . "philo")
321+
    ("philos. Anthrop." . "anth")
322+
    ("Phon." . "ling")
323+
    ("Phys." . "physics")
324+
    ("Physiol." . "anat")
325+
    ("Poetik" . "poet")
326+
    ("Pol." . "politics")
327+
    ("Postw." . "telecom")
328+
    ("Psych." . "psy")
329+
    ("Quantenphys." . "physics")
330+
    ("Radio" . "telecom")
331+
    ("Radrennen" . "sports")
332+
    ("Radsport" . "sports")
333+
    ("Rakugo" . "music")
334+
    ("Raumf." . "engr")
335+
    ("Reaktort." . "engr")
336+
    ("Rechnungsw." . "bus")
337+
    ("Rechtsw." . "law")
338+
    ("Redew." . "exp")
339+
    ("Regelungstechnik" . "engr")
340+
    ("Reiten" . "sports")
341+
    ("Rel." . "rel")
342+
    ("Rennsport" . "sports")
343+
    ("Rhetorik" . "philo")
344+
    ("Ringen" . "sports")
345+
    ("r??m. Gesch." . "hist")
346+
    ("r??m. Mythol." . "rel")
347+
    ("Rudern" . "sports")
348+
    ("Rugby" . "sports")
349+
    ("Rundfunk" . "telecom")
350+
    ("russ. Gesch." . "hist")
351+
    ("R??stung" . "mil")
352+
    ("Schach" . "game")
353+
    ("Schie??sport" . "sports")
354+
    ("Schiff" . "sail")
355+
    ("Schiffbau" . "sail")
356+
    ("Schifffahrt" . "sail")
357+
    ("Schmuck" . "build")
358+
    ("Schneckenk." . "zool")
359+
    ("Schneiderei" . "fashion")
360+
    ("Schuhe" . "fashion")
361+
    ("Schule" . "school")
362+
    ("Schwei??en" . "build")
363+
    ("Schwert" . "mil")
364+
    ("Schwimmen" . "sports")
365+
    ("Seef." . "sail")
366+
    ("Seen." . "place")
367+
    ("Seerechtsw." . "law")
368+
    ("Seew." . "sail")
369+
    ("Segeln" . "sail")
370+
    ("Seide" . "build")
371+
    ("Seismol." . "physics")
372+
    ("SF" . "work")
373+
    ("Shingon" . "Buddh")
374+
    ("Shint??" . "Shinto")
375+
    ("Sh??gi" . "shogi")
376+
    ("Ski" . "sports")
377+
    ("Skispringen" . "sports")
378+
    ("Softball" . "sports")
379+
    ("Sozialpsych." . "psy")
380+
    ("Soziol." . "sci")
381+
    ("span. Gesch." . "hist")
382+
    ("Spiel" . "game")
383+
    ("Spiele-Titel" . "game")
384+
    ("Spieltheorie" . "game")
385+
    ("Spinnen" . "zool")
386+
    ("Spinnenk." . "zool")
387+
    ("Sport" . "sports")
388+
    ("Sprache" . "ling")
389+
    ("Sprachw." . "ling")
390+
    ("Sprichw." . "ling")
391+
    ("Stadtn." . "place")
392+
    ("Stadtplanung" . "engr")
393+
    ("Statistik" . "math")
394+
    ("Steuerw." . "econ")
395+
    ("Sticken" . "build")
396+
    ("Stra??enbau" . "engr")
397+
    ("Stricken" . "build")
398+
    ("Strukturalismus" . "philo")
399+
    ("Sum??" . "sumo")
400+
    ("Surfen" . "sports")
401+
    ("Systemanalyse" . "engr")
402+
    ("Systemtheorie" . "engr")
403+
    ("taiwan. Pol." . "politics")
404+
    ("Tanz" . "art")
405+
    ("Tanzen" . "art")
406+
    ("Taoismus" . "rel")
407+
    ("Tauchen" . "sports")
408+
    ("Technik" . "engr")
409+
    ("Tee" . "hobby")
410+
    ("Telegrafie" . "telecom")
411+
    ("Telekom." . "telecom")
412+
    ("Tempeln." . "place")
413+
    ("Tennis" . "sports")
414+
    ("Tenn??" . "surname")
415+
    ("Textilt." . "engr")
416+
    ("Theat." . "work")
417+
    ("Theol." . "rel")
418+
    ("tibet. Buddh." . "Buddh")
419+
    ("Tiefb." . "engr")
420+
    ("Tiermed." . "med")
421+
    ("Tischtennis" . "sports")
422+
    ("Tourismus" . "hobby")
423+
    ("Tunnelbau" . "engr")
424+
    ("t??rk. Gesch." . "hist")
425+
    ("Turnen" . "sports")
426+
    ("TV" . "telecom")
427+
    ("TV-Prog." . "telecom")
428+
    ("U-Bahn" . "train")
429+
    ("Umwelt" . "biol")
430+
    ("Univ.-N." . "place")
431+
    ("Verhaltensbiol." . "biol")
432+
    ("Verkehrsw." . "engr")
433+
    ("Verlagsn." . "company")
434+
    ("Verlagsw." . "econ")
435+
    ("Versicherungsw." . "bus")
436+
    ("Video" . "telecom")
437+
    ("Videospiel" . "game")
438+
    ("vietnam. Gesch." . "hist")
439+
    ("Vogelk." . "zool")
440+
    ("V??lkerk." . "ethno")
441+
    ("V??lkerr." . "law")
442+
    ("Volleyball" . "sports")
443+
    ("Vorgesch." . "hist")
444+
    ("VWL" . "econ")
445+
    ("Waffenk." . "mil")
446+
    ("Walfang" . "hobby")
447+
    ("Wasserball" . "sports")
448+
    ("Wasserbau" . "engr")
449+
    ("Wassersport" . "sports")
450+
    ("Weben" . "build")
451+
    ("weibl. Name" . "given")
452+
    ("weibl. Vorn." . "given")
453+
    ("Weichtierk." . "zool")
454+
    ("Wein" . "alco")
455+
    ("Werbung" . "bus")
456+
    ("Werktitel" . "work")
457+
    ("Wintersport" . "sports")
458+
    ("Wirtsch." . "econ")
459+
    ("Wrestling" . "sports")
460+
    ("Wz." . "exp")
461+
    ("Yoga" . "sports")
462+
    ("Z??hlw." . "finc")
463+
    ("Zahnmed." . "med")
464+
    ("Zeitschriftenn." . "work")
465+
    ("Zeitungsn." . "work")
466+
    ("Zeitungsw." . "print")
467+
    ("Zellbiol." . "biol")
468+
    ("Zen" . "rel")
469+
    ("Zirkus" . "hobby")
470+
    ("Zitat" . "exp")
471+
    ("Zollw." . "bus")
472+
    ("Zool." . "biol")))
473+
474+
(define (hint->tag hint)
475+
  (match hint
476+
    ("Kansai-Dial." "ksb")
477+
    ("Ky??to-Dial." "kyb")
478+
    ("??saka-Dial." "osb")
479+
    ("Kant??-Dial." "ktb")
480+
    ("T??hoku-Dial." "thb")
481+
    ("Tsugaru-Dial." "tsug")
482+
    ("Ky??sh??-Dial." "kyu")
483+
    ("Hokkaid??-Dial." "hob")
484+
    ("altert??ml." "arch")
485+
    ("arch." "arch")
486+
    ("Edo-zeitl." "arch")
487+
    ("ehrerb.-h??fl." "hon")
488+
    ("h??fl.-ehrerb." "hon")
489+
    ("besch.-h??fl." "hum")
490+
    ("abwertend" "derog")
491+
    ("etwas altmodisch" "obs")
492+
    ("Gaunerjargon" "thief")
493+
    ("Frauenspr. der Edo-Zeit" "arch")
494+
    ("Geheimspr. der Gauner" "thief")
495+
    ("hist." "hist")
496+
    ("h??fl." "pol")
497+
    ("kinderspr." "chn")
498+
    ("M??nnersprache" "male")
499+
    ("Medizinerjargon" "med")
500+
    ("Milit??rjargon" "mil")
501+
    ("obsol." "obs")
502+
    ("onomat." "on-mim")
503+
    ("poet." "poet")
504+
    ("Polizeijargon" "law")
505+
    ("Slang" "sl")
506+
    ("veraltet" "arch")
507+
    (_ #f)))
508+
509+
(define (wadoku->tag-name wadoku-tag)
510+
  (let ((tag (assoc-ref wadoku-entities wadoku-tag)))
511+
    (or tag wadoku-tag)))
512+
513+
(define (get-tag tag)
514+
  (tag:get-tag (wadoku->tag-name tag)))

modules/nani/wadoku/xml.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani wadoku xml)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (ice-9 rdelim)
22+
  #:use-module (nani frequency)
23+
  #:use-module (nani result)
24+
  #:use-module (nani wadoku entities)
25+
  #:use-module (srfi srfi-9)
26+
  #:use-module (sxml ssax)
27+
  #:export (xml->results))
28+
29+
(define-record-type usage
30+
  (make-usage type reg content)
31+
  usage?
32+
  (type    usage-type)
33+
  (reg     usage-reg)
34+
  (content usage-content))
35+
36+
(define (flatten lst)
37+
  (match lst
38+
    (() '())
39+
    ((head lst ...)
40+
     (if (list? head)
41+
         (append (flatten head) (flatten lst))
42+
         (cons head (flatten lst))))
43+
    (lst (list lst))))
44+
45+
(define* (update-usage usage #:key (type    (usage-type    usage))
46+
                                   (reg     (usage-reg     usage))
47+
                                   (content (usage-content usage)))
48+
  (make-usage type reg content))
49+
50+
;; TODO
51+
(define (usage->tag usage)
52+
  (let ((content (usage-content usage))
53+
        (type (usage-type usage))
54+
        (reg (usage-reg usage)))
55+
    `(,@(if (string-null? content)
56+
            (match type
57+
              ("" '()))
58+
            (match type
59+
              ("dom" `((tag ,content)))
60+
              ("hint" (let ((tag (hint->tag content)))
61+
                        (if tag `(tag ,tag) '())))))
62+
      ,@(match reg
63+
          ("" '())
64+
          ("lit" `((tag "litf")))
65+
          ("kinderspr." `((tag "chn")))
66+
          ("kleinkindspr." `((tag "baby")))
67+
          ("jugendspr." `((tag "young")))
68+
          ("coll" `((tag "col")))
69+
          ("vulg." `((tag "vulg")))
70+
          ("Slang" `((tag "sl")))
71+
          ("poet." `((tag "poet")))
72+
          ("hist." `((tag "hist")))
73+
          ("hist" `((tag "hist")))
74+
          ("obsol." `((tag "obs")))
75+
          ("veraltet" `((tag "arch")))
76+
          ("sch??lerspr." `((tag "stud")))
77+
          ("studentenspr." `((tag "stud")))
78+
          ("frauenspr." `((tag "fem")))
79+
          ("gaunerspr." `((tag "thief")))
80+
          ("geheimspr." `((tag "secret")))
81+
          ("hofdamenspr." `((tag "fem") (tag "pol")))
82+
          ("seemannsspr." `((tag "sail")))))))
83+
84+
(define (usg->tag lst)
85+
  (define (usg->usage lst)
86+
    (let loop ((result (make-usage "" "" "")) (lst lst))
87+
      (if (null? lst)
88+
        result
89+
        (loop
90+
          (match (car lst)
91+
            (('type . type) (update-usage result #:type type))
92+
            (('reg . reg) (update-usage result #:reg reg))
93+
            (((? symbol? s) . v)
94+
             (throw 'unknow-usage-attr s v))
95+
            ((? string? content) (update-usage result #:content content))
96+
            ((? list? l) (loop result l))
97+
            (_ result))
98+
          (cdr lst)))))
99+
  (usage->tag (usg->usage lst)))
100+
101+
(define (ruigo->ref lst)
102+
  (let loop ((ref #f) (lst lst))
103+
    (match lst
104+
      (() ref)
105+
      ((('id id) lst ...)
106+
       (loop id lst))
107+
      (((a . b) lst ...)
108+
       (loop (loop ref (list (list a b))) lst))
109+
      (((? list? l) lst ...)
110+
       (loop (loop ref l) lst)))))
111+
112+
(define (merge-senses s1 s2)
113+
  (update-sense s2
114+
    #:references (append (sense-references s1) (sense-references s2))
115+
    #:tags (append (sense-tags s1) (sense-tags s2))
116+
    #:glosses (append (sense-glosses s1) (sense-glosses s2))))
117+
118+
(define (sxml->sense lst)
119+
  (let loop ((sense (make-sense '() '() '() '() '() '() "ger")) (lst lst))
120+
    (if (null? lst)
121+
      sense
122+
      (loop
123+
        (match (car lst)
124+
          (#f sense)
125+
          ((? sense? s)
126+
           (merge-senses s sense))
127+
          ((? source? s)
128+
           (update-sense sense #:sources (cons s (sense-sources sense))))
129+
          (('ref (? string? r))
130+
           (update-sense sense #:references (cons r (sense-references sense))))
131+
          (('tag (? string? r))
132+
           (update-sense sense #:tags (cons (get-tag r) (sense-tags sense))))
133+
          (('trans (? string? r))
134+
           (update-sense sense #:glosses (cons r (sense-glosses sense))))
135+
          (('related . _) sense)
136+
          (('transcr . _) sense)
137+
          (('pitch . _) sense)
138+
          (((? symbol? s) v)
139+
           (throw 'unknown-symbol s v))
140+
          ((? list? l) (loop sense l))
141+
          ((? string? _) sense))
142+
        (cdr lst)))))
143+
144+
;; TODO
145+
(define (sxml->ref lst)
146+
  (define (sxml->ref-name lst)
147+
    (let loop ((result #f) (lst lst))
148+
      (if (null? lst)
149+
        result
150+
        (loop
151+
          (match (car lst)
152+
            (('id . id) id)
153+
            (((? symbol? s) . _) result)
154+
            (((? symbol? s) _) result)
155+
            ((? list? l) (loop result l))
156+
            (_ result))
157+
          (cdr lst)))))
158+
  (let ((ref (sxml->ref-name lst)))
159+
    (if (string? ref)
160+
        `(ref ,ref)
161+
        (throw 'no-ref ref 'from lst))))
162+
163+
(define (sxml->source lst)
164+
  (let loop ((source (make-source '() #f "" "")) (lst lst))
165+
    (if (null? lst)
166+
      source
167+
      (loop
168+
        (match (car lst)
169+
          (('impli impli) (update-source source #:lang impli))
170+
          (('foreign foreign) (update-source source #:content foreign))
171+
          ((? list? l) (loop source l))
172+
          (_ source))
173+
        (cdr lst)))))
174+
175+
(define (sxml->reading lst)
176+
  (let loop ((reading (make-reading '() '() '())) (lst lst))
177+
    (if (null? lst)
178+
      reading
179+
      (loop
180+
        (match (car lst)
181+
          (('reading r)
182+
           (update-reading reading #:readings (cons r (reading-readings reading))))
183+
          (_ reading))
184+
        (cdr lst)))))
185+
186+
(define (sxml->result sxml frq)
187+
  (define (sxml->result-aux sxml)
188+
    (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml))
189+
      (if (null? lst)
190+
        result
191+
        (match (car lst)
192+
          (('kanji kanji)
193+
           (loop
194+
             (update-result result #:kanjis (cons kanji (result-kanjis result)))
195+
             last-source (cdr lst)))
196+
          ((? reading? reading)
197+
           (loop
198+
             (update-result result #:readings (cons reading (result-readings result)))
199+
             last-source (cdr lst)))
200+
          ((? sense? sense)
201+
           (loop
202+
             (update-result result
203+
               #:senses
204+
               (cons
205+
                 (if last-source
206+
                     (update-sense sense
207+
                       #:sources (cons last-source (sense-sources sense)))
208+
                     sense)
209+
                 (result-senses result)))
210+
             last-source (cdr lst)))
211+
          ((? list? l)
212+
           (loop (loop result last-source l) last-source (cdr lst)))
213+
          (_ (loop result last-source (cdr lst)))))))
214+
  (let* ((result (sxml->result-aux sxml))
215+
         (word (if (null? (result-kanjis result))
216+
                   (car (reading-readings (car (result-readings result))))
217+
                   (car (result-kanjis result))))
218+
         (entity (frequency-entity frq word))
219+
         (sense1 (car (result-senses result)))
220+
         (sense1 (if entity
221+
                     (update-sense sense1
222+
                       #:tags (cons (get-tag entity) (sense-tags sense1)))
223+
                     sense1))
224+
         (senses (cons sense1 (cdr (result-senses result)))))
225+
    (update-result result
226+
      #:points (tag:get-points (update-result result #:senses senses))
227+
      #:senses senses)))
228+
229+
(define (gram-tag lst)
230+
  (map
231+
    (lambda (attr)
232+
      (match attr
233+
        (('suru . suru)
234+
         `(tag ,(string-append "suru-" suru)))
235+
        (('ni . ni)
236+
         `(tag ,(string-append "ni-" ni)))
237+
        (('shiku . shiku)
238+
         `(tag ,(string-append "shiku-" shiku)))
239+
        (('ku . ku)
240+
         `(tag ,(string-append "ku-" ku)))
241+
        (('to . to)
242+
         `(tag ,(string-append "to-" to)))
243+
        (('no . no)
244+
         `(tag ,(string-append "no-" no)))
245+
        (('nari . nari)
246+
         `(tag ,(string-append "nari-" nari)))
247+
        (('taru . taru)
248+
         `(tag ,(string-append "taru-" taru)))
249+
        (('level . level)
250+
         `(tag ,(string-append "level-" level)))
251+
        (('godanrow . tr)
252+
         `(tag ,(string-append "godanrow-" tr)))
253+
        (('onbin . tr)
254+
         `(tag ,(string-append "onbin-" tr)))
255+
        (('transitivity . tr)
256+
         `(tag ,(string-append "transitivity-" tr)))))
257+
    lst))
258+
259+
(define (sxml->string lst)
260+
  (define (sub-loop loop tags result lst l)
261+
    (let ((result (loop tags result l)))
262+
      (if (list? result)
263+
          (loop (append tags (filter list? result))
264+
                (apply string-append (filter string? result))
265+
                lst)
266+
          (loop tags result lst))))
267+
268+
  (let loop ((tags '()) (result "") (lst lst))
269+
    (match lst
270+
      (() (if (null? tags)
271+
              result
272+
              (append tags (list result))))
273+
      ((? string? s)
274+
       (loop tags (string-append result s) '()))
275+
      (((? string? s) lst ...)
276+
       (loop tags (string-append result s) lst))
277+
      ((('prior . _) lst ...)
278+
       (loop tags result lst))
279+
      ((('options . _) lst ...)
280+
       (loop tags result lst))
281+
      ((('firstname . _) lst ...)
282+
       (loop tags result lst))
283+
      ((('ausn . _) lst ...)
284+
       (loop tags result lst))
285+
      ((('lang . _) lst ...)
286+
       (loop tags result lst))
287+
      ((('meta . _) lst ...)
288+
       (loop tags result lst))
289+
      ((('genki . genki) lst ...)
290+
       (loop (cons `(tag ,genki) tags) result lst))
291+
      ((('jlpt . jlpt) lst ...)
292+
       (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst))
293+
      ((('ref . (? string? ref)) lst ...)
294+
       (loop (cons `(ref ,ref) tags) result lst))
295+
      ((('jap l) lst ...)
296+
       (sub-loop loop tags result lst l))
297+
      ((('foreign l) lst ...)
298+
       (sub-loop loop tags result lst l))
299+
      ((('transcr l) lst ...)
300+
       (sub-loop loop tags result lst l))
301+
      ((((? symbol? s) v) lst ...)
302+
       (throw 'unsupported-symbol s v))
303+
      ((((? symbol? s) . v) lst ...)
304+
       (throw 'unsupported-symbol s v))
305+
      (((? list? l) lst ...)
306+
       (sub-loop loop tags result lst l)))))
307+
308+
(define (sxml->element lst elem frq)
309+
  (let ((elem (match elem
310+
                ((_ . elem) elem)
311+
                (_ elem))))
312+
    (match elem
313+
      ('orth (let ((kanji (filter string? lst)))
314+
               (if (null? kanji) #f `(kanji ,(car kanji)))))
315+
      ('entry (sxml->result lst frq))
316+
      ('hira `(reading ,(car lst)))
317+
      ('hatsuon `(hatsuon ,(car lst)))
318+
      ('accent `(pitch ,(car lst)))
319+
      ('reading (sxml->reading lst))
320+
      ('form lst)
321+
      ('impli `(impli ,(car lst)))
322+
      ('text (let loop ((text "") (lst lst))
323+
               (match lst
324+
                 (() text)
325+
                 ((('hasPrecedingSpace . _) lst ...)
326+
                  (loop (string-append " " text) lst))
327+
                 ((('hasFollowingSpace . _) lst ...)
328+
                  (string-append (loop text lst) " "))
329+
                 (((? string? s) lst ...)
330+
                  (loop (string-append text s) lst)))))
331+
      ('famn (sxml->string lst))
332+
      ('expl (sxml->string lst))
333+
      ('expli (filter list? lst))
334+
      ('abbrev (filter list? lst))
335+
      ('token (filter string? lst))
336+
      ('tr (sxml->string lst))
337+
      ('transcr `(transcr ,lst))
338+
      ('trans
339+
        (append (filter list? lst)
340+
                (map (lambda (s) `(trans ,s)) (filter string? lst))))
341+
      ('jap `(jap ,lst))
342+
      ('emph lst)
343+
      ('title (filter string? lst))
344+
      ('transl (filter string? lst))
345+
      ('topic (filter string? lst))
346+
      ('iron (filter string? lst))
347+
      ('specchar (filter string? lst))
348+
      ('scientif (filter string? lst))
349+
      ('wikide #f)
350+
      ('wikija #f)
351+
      ('link #f)
352+
      ('ref (sxml->ref lst))
353+
      ('sref (sxml->ref lst))
354+
      ('etym (sxml->source lst))
355+
      ('literal (sxml->string (list "???" (sxml->string lst) "???")))
356+
      ('def (sxml->string (list "(" (sxml->string lst) ")")))
357+
      ('date (sxml->string (list "(" (sxml->string lst) ")")))
358+
      ('birthdeath (sxml->string (list "(" (sxml->string lst) ")")))
359+
      ('descr (sxml->string (list "(" (sxml->string lst) ")")))
360+
      ('bracket (sxml->string (list "[" (sxml->string lst) "]")))
361+
      ('foreign (if (null? lst) #f `(foreign ,(car lst))))
362+
      ('seasonword `(tag ,(string-append "season-" (assoc-ref lst 'type))))
363+
      ('usg (usg->tag lst))
364+
      ('sense (sxml->sense lst))
365+
      ('steinhaus (let ((ref (sxml->string lst)))
366+
                    (if (string? ref)
367+
                        `(ref ,(sxml->string lst))
368+
                        (throw 'not-steinhaus ref))))
369+
      ('pos '()); TODO: actually find what tag to use
370+
      ('wordcomponent (cons `(tag "wordcomponent") (gram-tag lst)))
371+
      ('meishi (cons `(tag "meishi") (gram-tag lst)))
372+
      ('setsuzokushi (cons `(tag "setsuzokushi") (gram-tag lst)))
373+
      ('daimeishi (cons `(tag "daimeishi") (gram-tag lst)))
374+
      ('doushi (cons `(tag "doushi") (gram-tag lst)))
375+
      ('kandoushi (cons `(tag "kandoushi") (gram-tag lst)))
376+
      ('keiyoudoushi (cons `(tag "keiyoudoushi") (gram-tag lst)))
377+
      ('keiyoushi (cons `(tag "keiyoushi") (gram-tag lst)))
378+
      ('fukushi (cons `(tag "fukushi") (gram-tag lst)))
379+
      ('rengo (cons `(tag "rengo") (gram-tag lst)))
380+
      ('suffix (cons `(tag "suffix") (gram-tag lst)))
381+
      ('prefix (cons `(tag "prefix") (gram-tag lst)))
382+
      ('kanji (cons `(tag "kanji") (gram-tag lst)))
383+
      ('rentaishi (cons `(tag "rentaishi") (gram-tag lst)))
384+
      ('specialcharacter (cons `(tag "specialcharacter") (gram-tag lst)))
385+
      ('joshi (cons `(tag "joshi") (gram-tag lst)))
386+
      ('fukujoshi (cons `(tag "fukujoshi") (gram-tag lst)))
387+
      ('kakujoshi (cons `(tag "kakujoshi") (gram-tag lst)))
388+
      ('kakarijoshi (cons `(tag "kakarijoshi") (gram-tag lst)))
389+
      ('shuujoshi (cons `(tag "shuujoshi") (gram-tag lst)))
390+
      ('setsuzokujoshi (cons `(tag "setsuzokujoshi") (gram-tag lst)))
391+
      ('jokeiyoushi (cons `(tag "jokeiyoushi") (gram-tag lst)))
392+
      ('jodoushi (cons `(tag "jodoushi") (gram-tag lst)))
393+
      ('ruigos lst)
394+
      ('ruigo (ruigo->ref lst))
395+
      ('gramGrp lst))))
396+
397+
(define (create-parser frq)
398+
  (ssax:make-parser
399+
    NEW-LEVEL-SEED
400+
    (lambda (elem-gi attributes namespaces expected-content seed)
401+
      attributes)
402+
    
403+
    FINISH-ELEMENT
404+
    (lambda (elem-gi attributes namespaces parent-seed seed)
405+
      (if (equal? elem-gi 'entries)
406+
          seed
407+
          (let* ((seed (reverse seed))
408+
                 (element (sxml->element seed elem-gi frq)))
409+
            (cons element parent-seed))))
410+
    
411+
    CHAR-DATA-HANDLER
412+
    (lambda (string1 string2 seed)
413+
      (cons (string-append string1 string2) seed))))
414+
415+
(define (xml->results port frq)
416+
  (let ((resuts (filter result? ((create-parser frq) port '()))))
417+
    (sort results (lambda (a b) (> (result-points a) (result-points b))))))