Vtt support for transcripts (#7537)

2025-12-01 12:31:45 +00:00 · 2025-04-30 22:18:07 +02:00
parent 373cde5c0a
commit 560321cfd6
9 changed files with 370 additions and 43 deletions
--- a/app/src/main/java/de/danoeh/antennapod/ui/screen/playback/TranscriptAdapter.java
+++ b/app/src/main/java/de/danoeh/antennapod/ui/screen/playback/TranscriptAdapter.java
@ -78,6 +78,8 @@ public class TranscriptAdapter extends RecyclerView.Adapter<TranscriptViewholder
            if (speakers.isEmpty() && (position % 5 == 0)) {
                holder.viewTimecode.setVisibility(View.VISIBLE);
                holder.viewTimecode.setText(timecode);
+            } else {
+                holder.viewTimecode.setVisibility(View.GONE);
            }
            holder.viewContent.setText(seg.getWords());
        }
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
@ -45,7 +45,6 @@ public class FeedItem implements Serializable {
    private String socialInteractUrl;
    private String podcastIndexTranscriptUrl;
    private String podcastIndexTranscriptType;
-    private String podcastIndexTranscriptText;
    private Transcript transcript;

    private int state;
@ -456,29 +455,15 @@ public class FeedItem implements Serializable {
        return podcastIndexTranscriptType;
    }

-    public void updateTranscriptPreferredFormat(String type, String url) {
-        if (StringUtils.isEmpty(type) || StringUtils.isEmpty(url)) {
+    public void updateTranscriptPreferredFormat(String typeStr, String url) {
+        if (StringUtils.isEmpty(typeStr) || StringUtils.isEmpty(url)) {
            return;
        }
-
-        String canonicalSrr = "application/srr";
-        String jsonType = "application/json";
-
-        switch (type) {
-            case "application/json":
-                podcastIndexTranscriptUrl = url;
-                podcastIndexTranscriptType = type;
-                break;
-            case "application/srr":
-            case "application/srt":
-            case "application/x-subrip":
-                if (podcastIndexTranscriptUrl == null || !podcastIndexTranscriptType.equals(jsonType)) {
-                    podcastIndexTranscriptUrl = url;
-                    podcastIndexTranscriptType = canonicalSrr;
-                }
-                break;
-            default:
-                break;
+        TranscriptType type = TranscriptType.fromMime(typeStr);
+        TranscriptType previousType = TranscriptType.fromMime(podcastIndexTranscriptType);
+        if (type.priority > previousType.priority) {
+            podcastIndexTranscriptUrl = url;
+            podcastIndexTranscriptType = type.canonicalMime;
        }
    }

@ -490,14 +475,6 @@ public class FeedItem implements Serializable {
        transcript = t;
    }

-    public String getPodcastIndexTranscriptText() {
-        return podcastIndexTranscriptText;
-    }
-
-    public String setPodcastIndexTranscriptText(String str) {
-        return podcastIndexTranscriptText = str;
-    }
-
    public boolean hasTranscript() {
        return (podcastIndexTranscriptUrl != null);
    }
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
@ -2,8 +2,8 @@ package de.danoeh.antennapod.model.feed;

 public class TranscriptSegment {
    private final long startTime;
-    private final long endTime;
-    private final String words;
+    private long endTime;
+    private String words;
    private final String speaker;

    public TranscriptSegment(long start, long end, String w, String s) {
@ -13,6 +13,11 @@ public class TranscriptSegment {
        speaker = s;
    }

+    /**
+     * Extends this segment with the text of a following cue.
+     *
+     * @param newEndTime    end time that replaces the segment's current end time
+     * @param wordsToAppend text to add after the existing words, separated by a single space;
+     *                      {@code null} or empty text only moves the end time
+     */
+    public void append(long newEndTime, String wordsToAppend) {
+        endTime = newEndTime;
+        if (wordsToAppend == null || wordsToAppend.isEmpty()) {
+            return; // nothing to add; avoids leaving a dangling separator behind
+        }
+        if (words == null || words.isEmpty()) {
+            // No existing text: do not start the words with a separator (or with "null")
+            words = wordsToAppend;
+        } else {
+            words += " " + wordsToAppend;
+        }
+    }
+
    public long getStartTime() {
        return startTime;
    }
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptType.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptType.java
@ -0,0 +1,28 @@
+package de.danoeh.antennapod.model.feed;
+
+public enum TranscriptType {
+    JSON(4, "application/json"),
+    VTT(3, "text/vtt"),
+    SRT(2, "application/srt"),
+    NONE(0, "");
+
+    public final int priority;
+    public final String canonicalMime;
+
+    TranscriptType(int priority, String canonicalMime) {
+        this.priority = priority;
+        this.canonicalMime = canonicalMime;
+    }
+
+    public static TranscriptType fromMime(String type) {
+        if (type == null) {
+            return NONE;
+        }
+        return switch (type) {
+            case "application/json" -> JSON;
+            case "text/vtt" -> VTT;
+            case "application/srt", "application/srr", "application/x-subrip" -> SRT;
+            default -> NONE;
+        };
+    }
+}
--- a/net/download/service/src/main/java/de/danoeh/antennapod/net/download/service/episode/MediaDownloadedHandler.java
+++ b/net/download/service/src/main/java/de/danoeh/antennapod/net/download/service/episode/MediaDownloadedHandler.java
@ -70,7 +70,6 @@ public class MediaDownloadedHandler implements Runnable {
            if (item != null && item.getTranscriptUrl() != null) {
                String transcript = TranscriptUtils.loadTranscriptFromUrl(item.getTranscriptUrl(), true);
                if (!StringUtils.isEmpty(transcript)) {
-                    item.setPodcastIndexTranscriptText(transcript);
                    TranscriptUtils.storeTranscript(media, transcript);
                }
            }
--- a/net/download/service/src/main/java/de/danoeh/antennapod/net/download/service/feed/FeedUpdateWorker.java
+++ b/net/download/service/src/main/java/de/danoeh/antennapod/net/download/service/feed/FeedUpdateWorker.java
@ -167,6 +167,7 @@ public class FeedUpdateWorker extends Worker {
                        newEpisodesNotification.showIfNeeded(getApplicationContext(), savedFeed);
                    }
                } catch (Exception e) {
+                    e.printStackTrace();
                    DBWriter.setFeedLastUpdateFailed(feed.getId(), true);
                    DownloadResult status = new DownloadResult(feed.getTitle(),
                            feed.getId(), Feed.FEEDFILETYPE_FEED, false,
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
@ -1,5 +1,6 @@
 package de.danoeh.antennapod.parser.transcript;

+import de.danoeh.antennapod.model.feed.TranscriptType;
 import org.apache.commons.lang3.StringUtils;

 import de.danoeh.antennapod.model.feed.Transcript;
@ -8,18 +9,16 @@ public class TranscriptParser {
    static final long MIN_SPAN = 5000L; // Merge short segments together to form a span of 5 seconds
    static final long MAX_SPAN = 8000L; // Don't go beyond 8 seconds when merging

-    public static Transcript parse(String str, String type) {
+    public static Transcript parse(String str, String typeStr) {
        if (str == null || StringUtils.isBlank(str)) {
            return null;
        }
-
-        if ("application/json".equals(type)) {
-            return JsonTranscriptParser.parse(str);
-        }
-
-        if ("application/srt".equals(type) || "application/srr".equals(type) || "application/x-subrip".equals(type)) {
-            return SrtTranscriptParser.parse(str);
-        }
-        return null;
+        TranscriptType type = TranscriptType.fromMime(typeStr);
+        return switch (type) {
+            case JSON -> JsonTranscriptParser.parse(str);
+            case VTT -> VttTranscriptParser.parse(str);
+            case SRT -> SrtTranscriptParser.parse(str);
+            default -> null;
+        };
    }
 }
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/VttTranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/VttTranscriptParser.java
@ -0,0 +1,144 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import androidx.annotation.NonNull;
+import androidx.annotation.Nullable;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Lightweight parser that turns the cue blocks of a WebVTT document into a {@link Transcript}.
+ *
+ * <p>Only cue timing lines (the ones containing {@code -->}) and the payload lines following
+ * them are interpreted. Every other line is skipped.
+ */
+public class VttTranscriptParser {
+    // [hh:]mm:ss.ttt - the hour part is optional and may have two or more digits
+    private static final Pattern TIMESTAMP_PATTERN =
+            Pattern.compile("^(?:([0-9]{2,}):)?([0-9]{2}):([0-9]{2})\\.([0-9]{3})$");
+
+    // Voice tag such as "<v.loud Speaker 1>"; group 1 captures the speaker name
+    private static final Pattern VOICE_SPAN =
+            Pattern.compile("<v(?:\\.[^\\t\\n\\r &<>.]+)*[ \\t]([^\\n\\r&>]+)>");
+
+    /** Start and end time of a cue, in milliseconds. */
+    private record Timings(long start, long end) {}
+
+    /**
+     * Parses a WebVTT document.
+     *
+     * <p>A cue without a voice tag keeps the speaker of the previous cue. Consecutive cues of the
+     * same speaker are merged as long as the merged segment stays shorter than
+     * {@code TranscriptParser.MAX_SPAN}; a segment is emitted as soon as it spans at least
+     * {@code TranscriptParser.MIN_SPAN}.
+     *
+     * @param str content of the WebVTT file
+     * @return the parsed transcript, or {@code null} if {@code str} is blank, a cue timing line
+     *         cannot be parsed, or no cue was found
+     */
+    public static Transcript parse(String str) {
+        // This is basically a very light WebVTT parser.
+        // It uses WebVTT properties to be both exact and very light.
+        // We will only be parsing the WebVTT cue blocks.
+
+        if (StringUtils.isBlank(str)) {
+            return null;
+        }
+
+        // WebVTT line terminator can be \r\n, \n or \r, let's use only one
+        str = str.replaceAll("\r\n?", "\n");
+        List<String> lines = Arrays.asList(str.split("\n"));
+
+        Transcript transcript = new Transcript();
+        Iterator<String> iterator = lines.iterator();
+        Set<String> speakers = new HashSet<>();
+        String speaker = "";
+        TranscriptSegment segment = null;
+
+        // Iterate through cue blocks
+        while (iterator.hasNext()) {
+            String line = iterator.next();
+
+            // Anything that is not a cue timing line is ignored
+            if (!line.contains("-->")) {
+                continue;
+            }
+
+            Timings timings = parseCueTimings(line);
+            if (timings == null) {
+                return null; // Input is broken
+            }
+
+            // Consumes the iterator up to (and including) the empty line that ends the cue
+            String payload = parseCuePayload(iterator);
+
+            // Without a voice tag, the speaker of the previous cue is kept
+            Matcher matcher = VOICE_SPAN.matcher(payload);
+            if (matcher.find()) {
+                speaker = matcher.group(1);
+                speakers.add(speaker);
+            }
+
+            payload = Jsoup.parse(payload).text(); // remove all HTML tags
+
+            // should we merge this segment with the previous one?
+            if (segment != null && segment.getSpeaker().equals(speaker)
+                    && timings.end - segment.getStartTime() < TranscriptParser.MAX_SPAN) {
+                segment.append(timings.end, payload);
+            } else {
+                if (segment != null) {
+                    transcript.addSegment(segment);
+                }
+                segment = new TranscriptSegment(timings.start, timings.end, payload, speaker);
+            }
+
+            // do we have a candidate segment long enough to add it without trying to add more
+            if (segment.getEndTime() - segment.getStartTime() >= TranscriptParser.MIN_SPAN) {
+                transcript.addSegment(segment);
+                segment = null;
+            }
+        }
+
+        // Flush the last pending segment
+        if (segment != null) {
+            transcript.addSegment(segment);
+        }
+
+        if (transcript.getSegmentCount() == 0) {
+            return null;
+        }
+        transcript.setSpeakers(speakers);
+        return transcript;
+    }
+
+    /**
+     * Parses a run of digits, treating a missing (optional) group as zero.
+     *
+     * @throws NumberFormatException if the value does not fit into a {@code long}
+     */
+    private static long parseLongOrZero(@Nullable String s) {
+        return StringUtils.isEmpty(s) ? 0 : Long.parseLong(s);
+    }
+
+    /**
+     * Converts a {@code [hh:]mm:ss.ttt} timestamp to milliseconds.
+     *
+     * @return the timestamp in milliseconds, or -1 if it is malformed or too large to represent
+     */
+    private static long parseTimestamp(@NonNull String timestamp) {
+        Matcher matcher = TIMESTAMP_PATTERN.matcher(timestamp);
+        if (!matcher.matches()) {
+            return -1;
+        }
+        try {
+            long hours = parseLongOrZero(matcher.group(1));
+            long minutes = parseLongOrZero(matcher.group(2));
+            long seconds = parseLongOrZero(matcher.group(3));
+            long milliseconds = parseLongOrZero(matcher.group(4));
+            long belowHour = (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
+            // The hour part has no upper digit limit, so guard the arithmetic against overflow
+            return Math.addExact(Math.multiplyExact(hours, 60L * 60 * 1000), belowHour);
+        } catch (NumberFormatException | ArithmeticException e) {
+            return -1; // hour value too large to be represented
+        }
+    }
+
+    /**
+     * Parses a cue timing line of the form {@code start --> end [settings]}.
+     *
+     * @return the timings, or {@code null} if either timestamp is missing or malformed
+     */
+    @Nullable
+    private static Timings parseCueTimings(@NonNull String line) {
+        String[] timestamps = line.split("-->");
+        if (timestamps.length < 2) {
+            return null;
+        }
+        long start = parseTimestamp(timestamps[0].trim());
+        // Cue settings may follow the end timestamp, separated by a space or a tab
+        long end = parseTimestamp(timestamps[1].trim().split("[ \\t]")[0]);
+        if (start == -1 || end == -1) {
+            return null;
+        }
+        return new Timings(start, end);
+    }
+
+    /**
+     * Reads payload lines until the next empty line or the end of input and joins them
+     * with single spaces.
+     */
+    @NonNull
+    private static String parseCuePayload(@NonNull Iterator<String> iterator) {
+        StringBuilder body = new StringBuilder();
+        while (iterator.hasNext()) {
+            String line = iterator.next();
+            if (line.isEmpty()) {
+                break;
+            }
+            body.append(line.strip());
+            body.append(" ");
+        }
+        return body.toString().strip();
+    }
+
+}
--- a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/VttTranscriptParserTest.java
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/VttTranscriptParserTest.java
@ -0,0 +1,172 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import de.danoeh.antennapod.model.feed.Transcript;
+
+/**
+ * Tests for {@link VttTranscriptParser} and for the {@code text/vtt} handling of
+ * {@link TranscriptParser}.
+ */
+@RunWith(RobolectricTestRunner.class)
+public class VttTranscriptParserTest {
+
+    // Plain WebVTT fixture: cues with and without voice tags, using both
+    // mm:ss.ttt and hh:mm:ss.ttt timestamps
+    private static String vttStrSimple = """
+        WEBVTT
+
+        00:00.000 --> 00:02.000
+        Intro without speaker
+
+        00:00:02.000 --> 00:03.000
+        Still no speaker
+
+        00:03.389 --> 00:00:09.000
+        <v Speaker 1>This is the first speaker.
+
+        00:09.150 --> 00:12.123
+        Let's assume it's still the first speaker.
+
+        00:13.000 --> 00:15.000
+        <v Speaker 2>And this is the second.
+
+        00:15.000 --> 00:16.000
+        <v Speaker 2>Still talking.
+
+        00:16.000 --> 00:18.000
+        <v Speaker 2>Still same line.
+
+        00:18.000 --> 00:19.000
+        <v Speaker 2>New line.
+
+        00:22.000 --> 00:26.500
+        <v Speaker 2> Too long to collapse with previous.
+
+        00:36.000 --> 00:38.000
+        <v Speaker 1>And again.
+        Je suis speaker 1.
+
+        01:00:00.000 --> 01:00:01.000
+        <v Speaker 1>Still talking after one hour.""";
+
+    // This is the same content as above, but with lots more WEBVTT features
+    // (notes, cue identifiers, cue settings, voice classes, closing and nested tags)
+    private static String vttStrComplex = """
+        WEBVTT
+
+        NOTE This is a note
+
+        00:00.000 --> 00:02.000
+        Intro without speaker
+
+        00:00:02.000 --> 00:03.000
+        Still no speaker
+
+        NOTE Here is an other note
+
+        First block with a speaker
+        00:03.389 --> 00:00:09.000
+        <v.first Speaker 1>This is the first speaker.
+
+        00:09.150 --> 00:12.123
+        Let's assume it's still the first speaker.
+
+        00:13.000 --> 00:15.000\tposition:90% align:right size:35%
+        <v.second.loud Speaker 2>And this is the second.</v>
+
+        00:15.000 --> 00:16.000 position:10%,line-left align:left size:35%
+        <v Speaker 2>Still talking.
+
+        00:16.000 --> 00:18.000
+        <v Speaker 2>Still same line.
+
+        00:18.000 --> 00:19.000
+        <v Speaker 2>New line.
+
+        00:22.000 --> 00:26.500
+        <v Speaker 2> Too long to collapse with previous.
+
+        00:36.000 --> 00:38.000
+        <v Speaker 1>And again.
+        <i.foreignphrase><lang fr>Je suis</lang></i> speaker 1.
+
+        After one hour
+        01:00:00.000 --> 01:00:01.000
+        <v Speaker 1>Still talking after one hour.""";
+
+    /**
+     * Asserts the words, speaker and end time of every segment expected from the fixtures.
+     * Both fixtures are checked against these same expectations.
+     */
+    private void checkResults(Transcript result) {
+        // The two speaker-less intro cues are expected as a single merged segment
+        assertEquals("Intro without speaker Still no speaker", result.getSegmentAtTime(0L).getWords());
+        assertEquals("", result.getSegmentAtTime(0L).getSpeaker());
+        assertEquals(3000L, result.getSegmentAtTime(0L).getEndTime());
+
+        assertEquals("This is the first speaker.", result.getSegmentAtTime(3389L).getWords());
+        assertEquals("Speaker 1", result.getSegmentAtTime(3389L).getSpeaker());
+        assertEquals(9000L, result.getSegmentAtTime(3389L).getEndTime());
+
+        // This cue has no voice tag; the previous speaker is expected to carry over
+        assertEquals("Let's assume it's still the first speaker.", result.getSegmentAtTime(9150L).getWords());
+        assertEquals("Speaker 1", result.getSegmentAtTime(9150L).getSpeaker());
+        assertEquals(12123L, result.getSegmentAtTime(9150L).getEndTime());
+
+        // Three short cues of the same speaker are expected as one segment from 13s to 18s
+        assertEquals("And this is the second. Still talking. Still same line.",
+                result.getSegmentAtTime(13000L).getWords());
+        assertEquals("Speaker 2", result.getSegmentAtTime(13000L).getSpeaker());
+        assertEquals(18000L, result.getSegmentAtTime(13000L).getEndTime());
+
+        assertEquals("New line.", result.getSegmentAtTime(18000L).getWords());
+        assertEquals("Speaker 2", result.getSegmentAtTime(18000L).getSpeaker());
+        assertEquals(19000L, result.getSegmentAtTime(18000L).getEndTime());
+
+        assertEquals("Too long to collapse with previous.", result.getSegmentAtTime(22000L).getWords());
+        assertEquals("Speaker 2", result.getSegmentAtTime(22000L).getSpeaker());
+        assertEquals(26500L, result.getSegmentAtTime(22000L).getEndTime());
+
+        // A two-line payload is expected to be joined with a space, with markup removed
+        assertEquals("And again. Je suis speaker 1.", result.getSegmentAtTime(36000L).getWords());
+        assertEquals("Speaker 1", result.getSegmentAtTime(36000L).getSpeaker());
+        assertEquals(38000L, result.getSegmentAtTime(36000L).getEndTime());
+
+        // NOTE(review): the words lookup uses 10000000L while the two lines below use the cue's
+        // start time 3600000L; presumably this relies on getSegmentAtTime falling back to the
+        // last earlier segment - confirm this is intentional and not a typo
+        assertEquals("Still talking after one hour.", result.getSegmentAtTime(10000000L).getWords());
+        assertEquals("Speaker 1", result.getSegmentAtTime(3600000L).getSpeaker());
+        assertEquals(3601000L, result.getSegmentAtTime(3600000L).getEndTime());
+    }
+
+    /** Parses the plain fixture directly with {@link VttTranscriptParser}. */
+    @Test
+    public void testParseVttSimple() {
+        Transcript result = VttTranscriptParser.parse(vttStrSimple);
+        checkResults(result);
+    }
+
+    /** Parses the feature-rich fixture; the expected segments are the same as for the plain one. */
+    @Test
+    public void testParseVttComplex() {
+        Transcript result = VttTranscriptParser.parse(vttStrComplex);
+        checkResults(result);
+    }
+
+    /** Exercises {@link TranscriptParser#parse} with the VTT type, invalid input and wrong types. */
+    @Test
+    public void testParse() {
+        String type = "text/vtt";
+        Transcript result;
+
+        result = TranscriptParser.parse(vttStrSimple, type);
+        // There isn't a segment at 800L, so go backwards and get the segment at 0L
+        assertEquals("Intro without speaker Still no speaker", result.getSegmentAtTime(800L).getWords());
+
+        result = TranscriptParser.parse(null, type);
+        assertNull(result);
+
+        // blank string
+        result = TranscriptParser.parse("", type);
+        assertNull(result);
+
+        // All empty lines
+        result = TranscriptParser.parse("\r\n\r\n\r\n\r\n", type);
+        assertNull(result);
+
+        // Just plain text
+        result = TranscriptParser.parse("<v Speaker 1> Just text", type);
+        assertNull(result);
+
+        // passing the wrong type
+        result = TranscriptParser.parse(vttStrSimple, "application/srr");
+        assertNull(result);
+        result = TranscriptParser.parse(vttStrSimple, "unknown");
+        assertNull(result);
+    }
+}
+