0
0
mirror of https://github.com/ankidroid/Anki-Android.git synced 2024-09-20 03:52:15 +02:00

Clean up Beolingus parser (#4704)

Two regular expressions are worth 80 lines of code.
The newly added test also worked with the old code,
except that the old code for detecting the MP3 file
could only handle English pages.
This commit is contained in:
Roland Illig 2017-09-13 23:58:51 +02:00 committed by Tim Rae
parent 711e41f3cb
commit 86fe649bba
3 changed files with 50 additions and 93 deletions

View File

@ -285,7 +285,7 @@ public class LoadPronounciationActivity extends Activity implements OnCancelList
return;
}
mPronunciationAddress = BeolingusParser.getPronounciationAddressFromTranslation(mTranslation, mSource);
mPronunciationAddress = BeolingusParser.getPronunciationAddressFromTranslation(mTranslation, mSource);
if (mPronunciationAddress.contentEquals("no")) {

View File

@ -19,114 +19,43 @@
package com.ichi2.anki.multimediacard.beolingus.parsing;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class parses beolingus pages
*/
public class BeolingusParser {
private static String PRONUNC_STOPPER = "<img src=\"/pics/s1.png\"";
private static String MP3_STOPPER = ".mp3\">Listen";
private static final Pattern PRONUNC_PATTERN = Pattern.compile("" +
"<a href=\"([^\"]+)\"[^>]*>" +
"<img src=\"/pics/s1[.]png\"[^>]*title=\"([^\"]+)\"[^>]*>");
private static final Pattern MP3_PATTERN = Pattern.compile("href=\"([^\"]+\\.mp3)\">");
/**
* @param translationHtml = html page from beolingus, with translation of the word we search
* @param wordToSearchFor
* @return "no" or http address of the page with translation First this function searches for the picture as
* described above, this picture is in the pronunciation link. Then picture title is being compared to the
* word we search. If they match, means word found, and we have to go back in text, from image, inside the
* link, <a href="... and find there the address with pronunciation page, which is returned
* @param html HTML page from beolingus, with translation of the word we search
* @return {@code "no"} or the pronunciation URL
*/
public static String getPronounciationAddressFromTranslation(String translationHtml, String wordToSearchFor) {
String pronounciationIndicator = PRONUNC_STOPPER;
if (!translationHtml.contains(pronounciationIndicator)) {
return "no";
public static String getPronunciationAddressFromTranslation(String html, String wordToSearchFor) {
Matcher m = PRONUNC_PATTERN.matcher(html);
while (m.find()) {
if (m.group(2).equals(wordToSearchFor)) {
return "http://dict.tu-chemnitz.de" + m.group(1);
}
}
int indIndicator = 0;
do {
indIndicator = translationHtml.indexOf(pronounciationIndicator, indIndicator + 1);
if (indIndicator == -1) {
return "no";
}
String title = "title=\"";
int indTitle = translationHtml.indexOf(title, indIndicator);
if (indTitle == -1) {
return "no";
}
int indNextQuote = translationHtml.indexOf("\"", indTitle + title.length());
if (indNextQuote == -1) {
return "no";
}
// Must be equal to the word translating
String titleValue = translationHtml.substring(indTitle + title.length(), indNextQuote);
if (!titleValue.contentEquals(wordToSearchFor)) {
continue;
}
break;
// indIndicator is pointing to the right one indicator!
} while (true);
String href = "href=\"";
// Rolling back for the reference
while (indIndicator > 0) {
indIndicator -= 1;
if (!translationHtml.substring(indIndicator, indIndicator + href.length()).contentEquals(href)) {
continue;
}
break;
// indIndicator contains where href starts;
}
int indNextQuote = translationHtml.indexOf("\"", indIndicator + href.length());
if (indNextQuote == -1) {
return "no";
}
String pronounciationAddress = translationHtml.substring(indIndicator + href.length(), indNextQuote);
return "http://dict.tu-chemnitz.de" + pronounciationAddress;
return "no";
}
// It searches for a link to mp3 file
// First "mp3" is found, than it takes all the address, going before it.
/**
* @param pronunciationPageHtml
* @return "no" is returned or the http address of the mp3 file
* @return {@code "no"}, or the http address of the mp3 file
*/
public static String getMp3AddressFromPronounciation(String pronunciationPageHtml) {
if (pronunciationPageHtml.startsWith("FAILED")) {
return "no";
Matcher m = MP3_PATTERN.matcher(pronunciationPageHtml);
if (m.find()) {
return "http://dict.tu-chemnitz.de" + m.group(1);
}
String mp3 = MP3_STOPPER;
if (!pronunciationPageHtml.contains(mp3)) {
return "no";
}
int indMp3 = pronunciationPageHtml.indexOf(mp3);
int indAddrEnd = indMp3 + ".mp3".length();
int addrStart = 0;
// Back to find the address start;
while (indMp3 > 0) {
indMp3 -= 1;
if (pronunciationPageHtml.charAt(indMp3) == '\"') {
addrStart = indMp3 + 1;
break;
}
}
return "http://dict.tu-chemnitz.de" + pronunciationPageHtml.substring(addrStart, indAddrEnd);
return "no";
}
}

View File

@ -0,0 +1,28 @@
package com.ichi2.anki.multimediacard.beolingus.parsing;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
/**
 * Unit tests for {@link BeolingusParser}.
 *
 * <p>Both parser methods report failure by returning the literal string {@code "no"} instead of
 * throwing, so each method is checked on its success path and on its {@code "no"} path.
 */
public class BeolingusParserTest {

    /** Sample markup: a pronunciation link wrapping the speaker icon, whose title is "Wasser". */
    private static final String WASSER_LINK_HTML = ""
            + "<a href=\"/dings.cgi?speak=de/0/7/52qA5FttGIU;text=Wasser\" "
            + "onclick=\"return s(this)\" onmouseover=\"return u('Wasser')\">"
            + "<img src=\"/pics/s1.png\" width=\"16\" height=\"16\" "
            + "alt=\"[anhören]\" title=\"Wasser\" border=\"0\" align=\"top\" /></a>";

    /** The link's href is returned as an absolute URL when the icon title equals the searched word. */
    @Test
    public void testPronunciation() {
        String pronunciationUrl = BeolingusParser.getPronunciationAddressFromTranslation(WASSER_LINK_HTML, "Wasser");
        assertEquals("http://dict.tu-chemnitz.de/dings.cgi?speak=de/0/7/52qA5FttGIU;text=Wasser", pronunciationUrl);
    }

    /** {@code "no"} is returned when no icon title equals the searched word, or the page has no icon at all. */
    @Test
    public void testPronunciationNotFound() {
        // A pronunciation link exists, but it belongs to a different word.
        assertEquals("no", BeolingusParser.getPronunciationAddressFromTranslation(WASSER_LINK_HTML, "Feuer"));
        // No pronunciation link anywhere in the page.
        assertEquals("no", BeolingusParser.getPronunciationAddressFromTranslation("", "Wasser"));
    }

    /** The first href ending in {@code .mp3} is returned as an absolute URL. */
    @Test
    public void testMp3() {
        String html = "<td><a href=\"/speak-de/0/7/52qA5FttGIU.mp3\">Mit Ihrem";
        String mp3 = BeolingusParser.getMp3AddressFromPronounciation(html);
        assertEquals("http://dict.tu-chemnitz.de/speak-de/0/7/52qA5FttGIU.mp3", mp3);
    }

    /** {@code "no"} is returned when the page contains no link to an mp3 file. */
    @Test
    public void testMp3NotFound() {
        assertEquals("no", BeolingusParser.getMp3AddressFromPronounciation("<td>Mit Ihrem"));
        // Presumably the page loader passes a "FAILED..." marker when the download fails
        // (the parser used to special-case that prefix) -- it must still map to "no".
        assertEquals("no", BeolingusParser.getMp3AddressFromPronounciation("FAILED"));
    }
}