Highlight plain-text links in episode description (#7581)

This commit is contained in:
viariable 2025-07-20 16:31:09 +02:00 committed by GitHub
parent 55d3b743d1
commit 85313e28b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 301 additions and 0 deletions

View File

@ -0,0 +1,145 @@
package de.danoeh.antennapod.ui.cleaner;
import androidx.annotation.NonNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts plain-text URLs inside text or HTML into HTML {@code <a>} elements.
 * Text that is already located inside an anchor element is never touched.
 */
public class PlainTextLinksConverter {
    /** Matches {@code http(s)://} or {@code www.} prefixed URLs, case-insensitively. */
    private static final Pattern HTTP_LINK_REGEX = Pattern.compile(
            "(?:https?://(?:www\\.)?|www\\.)" // http(s)://[www.] OR www.
                    + "[-a-zA-Z0-9@:%._+~#=]{1,256}" // Domain name
                    + "\\.[a-zA-Z]{2,6}\\b" // Top-level domain
                    + "[-a-zA-Z0-9@:%_+.*~#?!&$/=()\\[\\],;]*", // Path, query params
            Pattern.CASE_INSENSITIVE
    );

    /**
     * Trailing characters that disqualify a match. A detected URL ending with one of these
     * is left as plain text instead of being converted.
     */
    protected static final List<String> NOT_ALLOWED_END_CHARS = List.of(
            ".", ",", ";", ":", "?", "!", ")", "(", "[", "]", "-", "_", "~", "#", "@", "$", "*", "+");

    /** Precompiled once so that scheme detection does not recompile a regex for every link. */
    private static final Pattern STARTS_WITH_HTTP = Pattern.compile("https?://", Pattern.CASE_INSENSITIVE);
    private static final String ANCHOR_TAG = "a";
    private static final String ANCHOR_ADDRESS = "href";

    /**
     * Provided text can be an HTML document or plain text.
     * It may contain a mixture of plain-text links and HTML links.
     * Only plain-text links will be converted to HTML {@code <a>} tags.
     *
     * @param text input to process, may be {@code null} or empty
     * @return the input itself if it is {@code null}, empty, or processing failed;
     *         otherwise the HTML of the parsed document body with plain-text links converted
     */
    public static String convertLinksToHtml(String text) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        try {
            Document doc = Jsoup.parse(text);
            convertLinksToHtml(doc);
            return doc.body().html();
        } catch (Exception ignored) {
            // Best effort: if parsing or conversion fails, hand back the unmodified input.
            return text;
        }
    }

    /**
     * Converts plain-text links in the body of the given document in place.
     *
     * @param doc document to modify; {@code null} is ignored
     */
    public static void convertLinksToHtml(Document doc) {
        if (doc == null) {
            return;
        }
        NodeTraversor.traverse(new LinkConvertingVisitor(), doc.body());
    }

    /**
     * Ensures that URLs are only converted if they are not already part of an existing anchor tag.
     * Document structure remains untouched, logic affects only {@link TextNode} - leaf element with no tags in it.
     * One {@link TextNode} is replaced with multiple nodes:
     * <ul>
     * <li>{@link TextNode} with text before the link</li>
     * <li>{@link Element} with the link tag</li>
     * <li>{@link TextNode} with text after the link</li>
     * </ul>
     */
    private static class LinkConvertingVisitor implements NodeVisitor {
        @Override
        public void head(@NonNull Node node, int depth) {
            if (!(node instanceof TextNode textNode)) {
                return;
            }
            if (isInsideAnchor(textNode)) {
                return;
            }
            String originalText = textNode.getWholeText();
            Matcher matcher = HTTP_LINK_REGEX.matcher(originalText);
            List<Node> newNodes = new ArrayList<>();
            int lastEnd = 0;
            while (matcher.find()) {
                String url = matcher.group();
                if (endsWithPunctuation(url)) {
                    // Rejected match: its text stays part of the surrounding plain text.
                    continue;
                }
                if (matcher.start() > lastEnd) {
                    newNodes.add(new TextNode(originalText.substring(lastEnd, matcher.start())));
                }
                newNodes.add(link(url));
                lastEnd = matcher.end();
            }
            if (newNodes.isEmpty()) {
                // No link was accepted, so keep the original node and do not mutate the tree.
                return;
            }
            if (lastEnd < originalText.length()) {
                newNodes.add(new TextNode(originalText.substring(lastEnd)));
            }
            Node parent = textNode.parent();
            if (parent instanceof Element parentElement) {
                // Read the index before removal so the new nodes take the old node's position.
                int index = textNode.siblingIndex();
                textNode.remove();
                parentElement.insertChildren(index, newNodes);
            }
        }

        /**
         * Builds an anchor element for the detected URL. The visible text is the URL as detected;
         * the target gets an {@code https://} prefix when the URL has no http(s) scheme.
         */
        private static Element link(String detectedUrl) {
            String url = detectedUrl;
            if (!STARTS_WITH_HTTP.matcher(detectedUrl).lookingAt()) {
                url = "https://" + url;
            }
            return new Element(ANCHOR_TAG).attr(ANCHOR_ADDRESS, url).text(detectedUrl);
        }

        @Override
        public void tail(@NonNull Node node, int depth) {
            //not needed
        }
    }

    /** Returns {@code true} if the node itself or any of its ancestors is an anchor element. */
    private static boolean isInsideAnchor(Node node) {
        Node current = node;
        while (current != null) {
            if (current instanceof Element currentElement
                    && ANCHOR_TAG.equalsIgnoreCase(currentElement.tagName())) {
                return true;
            }
            current = current.parent();
        }
        return false;
    }

    /** Returns {@code true} if the URL ends with one of {@link #NOT_ALLOWED_END_CHARS}. */
    private static boolean endsWithPunctuation(String url) {
        return NOT_ALLOWED_END_CHARS.stream().anyMatch(url::endsWith);
    }
}

View File

@ -76,6 +76,7 @@ public class ShownotesCleaner {
/**
* Applies an app-specific CSS stylesheet and adds timecode links (optional).
* Also converts plain-text links to HTML links.
* <p/>
* This method does NOT change the original shownotes string of the shownotesProvider object and it should
* also not be changed by the caller.
@ -99,6 +100,7 @@ public class ShownotesCleaner {
Document document = Jsoup.parse(shownotes);
cleanCss(document);
document.head().appendElement("style").attr("type", "text/css").text(webviewStyle);
PlainTextLinksConverter.convertLinksToHtml(document);
addTimecodes(document);
document.body().attr("dir", "auto");
return document.toString();

View File

@ -0,0 +1,154 @@
package de.danoeh.antennapod.ui.cleaner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static de.danoeh.antennapod.ui.cleaner.PlainTextLinksConverter.NOT_ALLOWED_END_CHARS;
import org.junit.Test;
import java.util.List;
/**
 * Tests for {@link PlainTextLinksConverter#convertLinksToHtml(String)}.
 */
public class PlainTextLinksConverterTest {
    @Test
    public void testConvertPlainTextLinksToHtml() {
        final String urlToLink = "https://url.to/link";
        assertLinkAfterText(urlToLink);

        final String telegramLink = "https://t.me/link";
        assertLinkAfterText(telegramLink);

        // A www. link without scheme gets https:// in the href, but keeps its visible text
        assertConverted("artist here: <a href=\"https://www.example.com\">www.example.com</a>",
                "artist here: www.example.com");

        final String bothConverted = "text " + makeLinkHtml(urlToLink) + " and " + makeLinkHtml(telegramLink);
        assertConverted(bothConverted, "text " + urlToLink + " and " + telegramLink);

        // Mixture of a plain-text link and an existing HTML link
        assertConverted(bothConverted, "text " + urlToLink + " and " + makeLinkHtml(telegramLink));

        // Links directly preceded by special characters
        assertConverted("text'" + makeLinkHtml(urlToLink) + " and=" + makeLinkHtml(telegramLink),
                "text'" + urlToLink + " and=" + telegramLink);

        final String withParams = "http://t.me/link#mark?param1=1&param2=true;param3=true";
        assertConverted("text " + makeLinkHtml(withParams) + " after-text",
                "text " + withParams + " after-text");

        // Comma and parenthesis inside the path
        assertLinkAfterText("https://example.org/%D0%%86_(%D1%%BC,_2020");
        // Dot inside the path
        assertLinkAfterText("https://www.ietf.org/rfc/rfc3986.txt");
        // Tilde and other punctuation inside the path
        assertLinkAfterText("https://www.example.org/valid/-~.,$/url/");
        // Exclamation mark inside the fragment
        assertLinkAfterText("http://www.example.com/index.php?id=123&v=wall#!/index.php?id=234");
        // Square brackets inside the query
        assertLinkAfterText("http://www.example.com/index.php?bar[]=1&bar[]=2");
        // Asterisk and nested URL inside the path
        assertLinkAfterText("https://archive.org/web/*/http://www.example.com/");
    }

    @Test
    public void testBrokenLinksAreNotCreated() {
        assertUnchanged("Sign up now (http://example.com/abc)");
        assertUnchanged("Sign up now (http://example.com/abc)! please");
        assertUnchanged("To read on, visit https://example.com.");
        //we choose to ignore links like this, even though they are valid
        assertUnchanged("Visit https://example.com/wiki_(url+rules)");

        final String url = "https://example.com/abc";
        for (String forbiddenEnd : NOT_ALLOWED_END_CHARS) {
            assertUnchanged(url + forbiddenEnd);
        }

        final String anchor = makeLinkHtml(url);
        assertConverted("(https://example.com/abc) and " + anchor,
                "(" + url + ") and " + url);
        assertConverted("text " + anchor + " and (https://example.com/abc)",
                "text " + url + " and (" + url + ")");
        assertConverted("text " + anchor + " and (https://example.com/abc) and " + anchor,
                "text " + url + " and (" + url + ") and " + url);
    }

    @Test
    public void testExistingLinksArePreserved() {
        final List<String> alreadyLinked = List.of(
                "Click <a alt=\"abc\" href=\"http://url.to/link\">http://url.to/link, this link</a>",
                "<a href=\"http://domain.org/link\">domain.org</a>",
                "you can find it on <a href=\"http://xy.org\">our new website http://xy.org</a>",
                "you can find it on <a href=\"http://xy.org/newlanding\">our new website http://xy.org</a>",
                "<p><img src=\"https://url.to/i.jpg\" alt=\"https://url.to/i.jpg\"></p>",
                "text \n<audio src=\"https://url.to/i.mp3\" alt=\"https://url.to/i.mp3\">\n text \n</audio>",
                "<a href=\"https://example.com/p/ai-fakers?utm_source=example&amp;utm_medium=email\">AI interview</a> - <em>01:57:01</em>",
                "sign up for our premium feed here! <a href=\"https://www.example.com/url?q=https://example.com/join&amp;source=gmail-imap&amp;ust=123&amp;usg=AOvVaw123gzEv9s9\"><strong>https://example.com/join</strong></a>",
                "you can do so here:<a href=\"https://www.example.com/url?q=https://example.com/button&amp;source=gmail-imap&amp;ust=123&amp;usg=AOvV123jw--CX123tATY\"><strong>https://example.com/button</strong></a>",
                "LINKS:<a href=\"https://www.example.com/url?q=https://example.org/&amp;source=gmail-imap&amp;ust=123&amp;usg=AOvVa123GJxenALD\"><strong>Example</strong></a>",
                "<a href=\"https://www.example.com/url?q=https://example.org/buttons/ask-me-chili-cheese-fries&amp;source=gmail-imap&amp;ust=123&amp;usg=AOvVaw2oFNwzuvrfrokwHf6zq1P4\"><strong>Example</strong></a>",
                "<p><a href=\"https://example.com/media/FN_123zV2i?format=png&amp;name=900x900\">A picture of the photo in question</a></p>",
                "<a href=\"https://www.example.com/redirect?event=video_description&amp;redir_token=123l&amp;q=https%3A%2F%2Fexample.com%2Fshop%2Fbook%2F&amp;v=4iOzkYTrjzg\">https://example.com/shop/book/</a>",
                "<a href=\"https://www.example.com/redirect?event=video_description&amp;redir_token=123Ws&amp;q=https%3A%2F%2Fexample.me%2FyH6x%2Fgx5ywe7g&amp;v=yIbY7x5zQO8\">https://example.me/yH6x/gx5ywe7g</a>",
                ""
        );
        for (String html : alreadyLinked) {
            assertUnchanged(html);
        }
    }

    @Test
    public void testConvertToHtmlWhenNoLinksAreDetected() {
        assertNull(PlainTextLinksConverter.convertLinksToHtml((String) null));
        assertConverted("", "");
        assertUnchanged("plain text");

        // Characters with a special meaning in HTML come back escaped
        final String withSpecialChars = "text with ' special \" characters !@#$%^&*()<>?123";
        final String escaped = withSpecialChars
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;");
        assertConverted(escaped, withSpecialChars);

        assertUnchanged("\"Text With...Dots Works\"");
    }

    /**
     * Runs the converter under test on the given input.
     */
    private static String convert(String input) {
        return PlainTextLinksConverter.convertLinksToHtml(input);
    }

    /**
     * Asserts that converting {@code input} yields exactly {@code expected}.
     */
    private static void assertConverted(String expected, String input) {
        assertEquals(expected, convert(input));
    }

    /**
     * Asserts that the converter returns {@code input} as it is.
     */
    private static void assertUnchanged(String input) {
        assertConverted(input, input);
    }

    /**
     * Asserts that {@code "text " + link} is converted to the same text with the link wrapped in an anchor.
     */
    private static void assertLinkAfterText(String link) {
        assertConverted("text " + makeLinkHtml(link), "text " + link);
    }

    /**
     * Wraps the provided string in {@code <a href>..</a>}, escaping ampersands.
     * Returns an empty string for {@code null} or empty input.
     */
    private static String makeLinkHtml(String plain) {
        if (plain == null || plain.isEmpty()) {
            return "";
        }
        final String escaped = plain.replace("&", "&amp;");
        return "<a href=\"" + escaped + "\">" + escaped + "</a>";
    }
}