Clean up Beolingus parser (#4704)

Two regular expressions are worth 80 lines of code. The newly added test also worked with the old code, except that the old code for detecting the MP3 file could only handle English pages.
2024-09-20 03:52:15 +02:00 · 2017-09-13 23:58:51 +02:00 · 2017-09-13 23:58:51 +02:00 · 86fe649bba
commit 86fe649bba
parent 711e41f3cb
3 changed files with 50 additions and 93 deletions
--- a/AnkiDroid/src/main/java/com/ichi2/anki/multimediacard/activity/LoadPronounciationActivity.java
+++ b/AnkiDroid/src/main/java/com/ichi2/anki/multimediacard/activity/LoadPronounciationActivity.java
@@ -285,7 +285,7 @@ public class LoadPronounciationActivity extends Activity implements OnCancelList
                return;
            }

-            mPronunciationAddress = BeolingusParser.getPronounciationAddressFromTranslation(mTranslation, mSource);
+            mPronunciationAddress = BeolingusParser.getPronunciationAddressFromTranslation(mTranslation, mSource);

            if (mPronunciationAddress.contentEquals("no")) {

--- a/AnkiDroid/src/main/java/com/ichi2/anki/multimediacard/beolingus/parsing/BeolingusParser.java
+++ b/AnkiDroid/src/main/java/com/ichi2/anki/multimediacard/beolingus/parsing/BeolingusParser.java
@@ -19,114 +19,43 @@

 package com.ichi2.anki.multimediacard.beolingus.parsing;

+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 /**
 * This class parses beolingus pages
 */
 public class BeolingusParser {
-    private static String PRONUNC_STOPPER = "<img src=\"/pics/s1.png\"";
-    private static String MP3_STOPPER = ".mp3\">Listen";

+    private static final Pattern PRONUNC_PATTERN = Pattern.compile("" +
+            "<a href=\"([^\"]+)\"[^>]*>" +
+            "<img src=\"/pics/s1[.]png\"[^>]*title=\"([^\"]+)\"[^>]*>");
+    private static final Pattern MP3_PATTERN = Pattern.compile("href=\"([^\"]+\\.mp3)\">");

    /**
-     * @param translationHtml = html page from beolingus, with translation of the word we search
-     * @param wordToSearchFor
-     * @return "no" or http address of the page with translation First this function searches for the picture as
-     *         described above, this picture is in the pronunciation link. Then picture title is being compared to the
-     *         word we search. If they match, means word found, and we have to go back in text, from image, inside the
-     *         link, <a href="... and find there the address with pronunciation page, which is returned
+     * @param html HTML page from beolingus, with translation of the word we search
+     * @return {@code "no"} or the pronunciation URL
     */
-    public static String getPronounciationAddressFromTranslation(String translationHtml, String wordToSearchFor) {
-        String pronounciationIndicator = PRONUNC_STOPPER;
-        if (!translationHtml.contains(pronounciationIndicator)) {
-            return "no";
+    public static String getPronunciationAddressFromTranslation(String html, String wordToSearchFor) {
+        Matcher m = PRONUNC_PATTERN.matcher(html);
+        while (m.find()) {
+            if (m.group(2).equals(wordToSearchFor)) {
+                return "http://dict.tu-chemnitz.de" + m.group(1);
+            }
        }
-
-        int indIndicator = 0;
-        do {
-            indIndicator = translationHtml.indexOf(pronounciationIndicator, indIndicator + 1);
-            if (indIndicator == -1) {
-                return "no";
-            }
-            String title = "title=\"";
-
-            int indTitle = translationHtml.indexOf(title, indIndicator);
-
-            if (indTitle == -1) {
-                return "no";
-            }
-
-            int indNextQuote = translationHtml.indexOf("\"", indTitle + title.length());
-            if (indNextQuote == -1) {
-                return "no";
-            }
-
-            // Must be equal to the word translating
-            String titleValue = translationHtml.substring(indTitle + title.length(), indNextQuote);
-
-            if (!titleValue.contentEquals(wordToSearchFor)) {
-                continue;
-            }
-
-            break;
-            // indIndicator is pointing to the right one indicator!
-        } while (true);
-
-        String href = "href=\"";
-        // Rolling back for the reference
-        while (indIndicator > 0) {
-            indIndicator -= 1;
-            if (!translationHtml.substring(indIndicator, indIndicator + href.length()).contentEquals(href)) {
-                continue;
-            }
-
-            break;
-            // indIndicator contains where href starts;
-        }
-
-        int indNextQuote = translationHtml.indexOf("\"", indIndicator + href.length());
-        if (indNextQuote == -1) {
-            return "no";
-        }
-
-        String pronounciationAddress = translationHtml.substring(indIndicator + href.length(), indNextQuote);
-
-        return "http://dict.tu-chemnitz.de" + pronounciationAddress;
+        return "no";
    }


-    // It searches for a link to mp3 file
-    // First "mp3" is found, than it takes all the address, going before it.
    /**
-     * @param pronunciationPageHtml
-     * @return "no" is returned or the http address of the mp3 file
+     * @return {@code "no"}, or the http address of the mp3 file
     */
    public static String getMp3AddressFromPronounciation(String pronunciationPageHtml) {
-        if (pronunciationPageHtml.startsWith("FAILED")) {
-            return "no";
+        Matcher m = MP3_PATTERN.matcher(pronunciationPageHtml);
+        if (m.find()) {
+            return "http://dict.tu-chemnitz.de" + m.group(1);
        }
-
-        String mp3 = MP3_STOPPER;
-
-        if (!pronunciationPageHtml.contains(mp3)) {
-            return "no";
-        }
-
-        int indMp3 = pronunciationPageHtml.indexOf(mp3);
-        int indAddrEnd = indMp3 + ".mp3".length();
-
-        int addrStart = 0;
-        // Back to find the address start;
-        while (indMp3 > 0) {
-            indMp3 -= 1;
-            if (pronunciationPageHtml.charAt(indMp3) == '\"') {
-                addrStart = indMp3 + 1;
-                break;
-            }
-
-        }
-
-        return "http://dict.tu-chemnitz.de" + pronunciationPageHtml.substring(addrStart, indAddrEnd);
-
+        return "no";
    }

 }
--- a/AnkiDroid/src/test/java/com/ichi2/anki/multimediacard/beolingus/parsing/BeolingusParserTest.java
+++ b/AnkiDroid/src/test/java/com/ichi2/anki/multimediacard/beolingus/parsing/BeolingusParserTest.java
@@ -0,0 +1,28 @@
+package com.ichi2.anki.multimediacard.beolingus.parsing;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class BeolingusParserTest {
+
+    @Test
+    public void testPronunciation() {
+        String html = ""
+                + "<a href=\"/dings.cgi?speak=de/0/7/52qA5FttGIU;text=Wasser\" "
+                + "onclick=\"return s(this)\" onmouseover=\"return u('Wasser')\">"
+                + "<img src=\"/pics/s1.png\" width=\"16\" height=\"16\" "
+                + "alt=\"[anhören]\" title=\"Wasser\" border=\"0\" align=\"top\" /></a>";
+
+        String pronunciationUrl = BeolingusParser.getPronunciationAddressFromTranslation(html, "Wasser");
+        assertEquals("http://dict.tu-chemnitz.de/dings.cgi?speak=de/0/7/52qA5FttGIU;text=Wasser", pronunciationUrl);
+    }
+
+    @Test
+    public void testMp3() {
+        String html = "<td><a href=\"/speak-de/0/7/52qA5FttGIU.mp3\">Mit Ihrem";
+
+        String mp3 = BeolingusParser.getMp3AddressFromPronounciation(html);
+        assertEquals("http://dict.tu-chemnitz.de/speak-de/0/7/52qA5FttGIU.mp3", mp3);
+    }
+}