Improve furigana matching

Julien Lepiller, Sat Jul 31 14:40:47 +0200 2021

5ee3eb5

Improve furigana matching

app/src/main/java/eu/lepiller/nani/result/Result.java

140140
141141
        // split the text into kanji / not kanji portions
142142
        ArrayList<String> portions = new ArrayList<>();
143-
        ArrayList<String> portionsMatcher = new ArrayList<>();
144143
145144
        StringBuilder current = new StringBuilder();
146-
        StringBuilder currentMatcher = new StringBuilder();
147145
        Character.UnicodeBlock b = CJK_UNIFIED_IDEOGRAPHS;
148146
149147
        MojiConverter converter = new MojiConverter();

153151
                // if the headword contains katakana, convert it to hiragana to match pronunciation
154152
                // better.
155153
                current.append(txt.charAt(i));
156-
                if(b2 == KATAKANA) {
157-
                    String s = new String(new char[]{txt.charAt(i)});
158-
                    String hiragana = converter.convertRomajiToHiragana(converter.convertKanaToRomaji(s));
159-
                    currentMatcher.append(hiragana.charAt(0));
160-
                } else {
161-
                    currentMatcher.append(txt.charAt(i));
162-
                }
163154
            } else {
164155
                String s = current.toString();
165156
                if(!s.isEmpty())
166157
                    portions.add(s);
167-
                s = currentMatcher.toString();
168-
                if(!s.isEmpty())
169-
                    portionsMatcher.add(s);
170158
                current = new StringBuilder();
171-
                currentMatcher = new StringBuilder();
172159
                current.append(txt.charAt(i));
173-
                if(b2 == KATAKANA) {
174-
                    String katakana = new String(new char[]{txt.charAt(i)});
175-
                    String hiragana = converter.convertRomajiToHiragana(converter.convertKanaToRomaji(katakana));
176-
                    currentMatcher.append(hiragana.charAt(0));
177-
                } else {
178-
                    currentMatcher.append(txt.charAt(i));
179-
                }
180160
            }
181161
182162
            b = b2;

184164
        String str = current.toString();
185165
        if(!str.isEmpty())
186166
            portions.add(str);
187-
        str = currentMatcher.toString();
188-
        if(!str.isEmpty()) {
189-
            portionsMatcher.add(str);
190-
        }
191167
192168
        // Create a regexp to match kanji places
193169
        current = new StringBuilder();
194170
        current.append("^");
195-
        for(String s: portionsMatcher) {
171+
        for(String s: portions) {
196172
            if(Character.UnicodeBlock.of(s.charAt(0)) == CJK_UNIFIED_IDEOGRAPHS) {
197173
                current.append("(.*)");
198174
            } else {
199-
                current.append(s);
175+
                for(Character c: s.toCharArray()) {
176+
                    if(Character.UnicodeBlock.of(c) == KATAKANA) {
177+
                        current.append("[");
178+
                        current.append(c);
179+
                        current.append(converter.convertRomajiToHiragana(converter.convertKanaToRomaji(new String(new char[]{c}))));
180+
                        current.append("]");
181+
                    } else {
182+
                        current.append(c);
183+
                    }
184+
                }
200185
            }
201186
        }
202187
        current.append("$");