Vtt support for transcripts (#7537)

This commit is contained in:
Sacha Delanoue 2025-04-30 22:18:07 +02:00 committed by GitHub
parent 373cde5c0a
commit 560321cfd6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 370 additions and 43 deletions

View File

@ -78,6 +78,8 @@ public class TranscriptAdapter extends RecyclerView.Adapter<TranscriptViewholder
if (speakers.isEmpty() && (position % 5 == 0)) {
holder.viewTimecode.setVisibility(View.VISIBLE);
holder.viewTimecode.setText(timecode);
} else {
holder.viewTimecode.setVisibility(View.GONE);
}
holder.viewContent.setText(seg.getWords());
}

View File

@ -45,7 +45,6 @@ public class FeedItem implements Serializable {
private String socialInteractUrl;
private String podcastIndexTranscriptUrl;
private String podcastIndexTranscriptType;
private String podcastIndexTranscriptText;
private Transcript transcript;
private int state;
@ -456,29 +455,15 @@ public class FeedItem implements Serializable {
return podcastIndexTranscriptType;
}
public void updateTranscriptPreferredFormat(String type, String url) {
if (StringUtils.isEmpty(type) || StringUtils.isEmpty(url)) {
public void updateTranscriptPreferredFormat(String typeStr, String url) {
if (StringUtils.isEmpty(typeStr) || StringUtils.isEmpty(url)) {
return;
}
String canonicalSrr = "application/srr";
String jsonType = "application/json";
switch (type) {
case "application/json":
podcastIndexTranscriptUrl = url;
podcastIndexTranscriptType = type;
break;
case "application/srr":
case "application/srt":
case "application/x-subrip":
if (podcastIndexTranscriptUrl == null || !podcastIndexTranscriptType.equals(jsonType)) {
podcastIndexTranscriptUrl = url;
podcastIndexTranscriptType = canonicalSrr;
}
break;
default:
break;
TranscriptType type = TranscriptType.fromMime(typeStr);
TranscriptType previousType = TranscriptType.fromMime(podcastIndexTranscriptType);
if (type.priority > previousType.priority) {
podcastIndexTranscriptUrl = url;
podcastIndexTranscriptType = type.canonicalMime;
}
}
@ -490,14 +475,6 @@ public class FeedItem implements Serializable {
transcript = t;
}
public String getPodcastIndexTranscriptText() {
return podcastIndexTranscriptText;
}
public String setPodcastIndexTranscriptText(String str) {
return podcastIndexTranscriptText = str;
}
public boolean hasTranscript() {
return (podcastIndexTranscriptUrl != null);
}

View File

@ -2,8 +2,8 @@ package de.danoeh.antennapod.model.feed;
public class TranscriptSegment {
private final long startTime;
private final long endTime;
private final String words;
private long endTime;
private String words;
private final String speaker;
public TranscriptSegment(long start, long end, String w, String s) {
@ -13,6 +13,11 @@ public class TranscriptSegment {
speaker = s;
}
/**
 * Extends this segment so that it also covers a following piece of text.
 *
 * <p>The end time is always moved to {@code newEndTime}. The text is joined to the
 * existing words with a single space; when either side is missing or empty no
 * separator is inserted, so the result never starts or ends with a stray space
 * and never contains the literal string "null".
 *
 * @param newEndTime    new end time of the merged segment
 * @param wordsToAppend text to add after the current words, may be empty
 */
public void append(long newEndTime, String wordsToAppend) {
    endTime = newEndTime;
    if (wordsToAppend == null || wordsToAppend.isEmpty()) {
        return; // nothing to add, only the time span grows
    }
    if (words == null || words.isEmpty()) {
        words = wordsToAppend;
    } else {
        words = words + " " + wordsToAppend;
    }
}
public long getStartTime() {
return startTime;
}

View File

@ -0,0 +1,28 @@
package de.danoeh.antennapod.model.feed;
/**
 * Transcript formats that can be referenced by a feed item.
 *
 * <p>Each constant carries a {@link #priority} (a higher value wins when several
 * formats are offered for the same item) and the {@link #canonicalMime} string
 * that is stored for it. {@link #NONE} stands for "no or unsupported format".
 */
public enum TranscriptType {
    JSON(4, "application/json"),
    VTT(3, "text/vtt"),
    SRT(2, "application/srt"),
    NONE(0, "");

    /** Preference rank; a type replaces another one only if its priority is strictly higher. */
    public final int priority;
    /** MIME type under which this format is stored; empty for {@link #NONE}. */
    public final String canonicalMime;

    TranscriptType(int priority, String canonicalMime) {
        this.priority = priority;
        this.canonicalMime = canonicalMime;
    }

    /**
     * Maps a MIME type string to the matching transcript type.
     *
     * <p>Matching ignores case, surrounding whitespace and any parameters after a
     * {@code ';'} (for example {@code "Text/VTT; charset=utf-8"} maps to {@link #VTT}),
     * because media type names are case-insensitive and may carry parameters.
     *
     * @param type MIME type as found in the feed, may be {@code null}
     * @return the matching type, or {@link #NONE} if {@code type} is {@code null} or unknown
     */
    public static TranscriptType fromMime(String type) {
        if (type == null) {
            return NONE;
        }
        String mime = type;
        int parametersStart = mime.indexOf(';');
        if (parametersStart >= 0) {
            mime = mime.substring(0, parametersStart);
        }
        // Locale.ROOT keeps the lowercasing independent of the device language
        mime = mime.trim().toLowerCase(java.util.Locale.ROOT);
        return switch (mime) {
            case "application/json" -> JSON;
            case "text/vtt" -> VTT;
            case "application/srt", "application/srr", "application/x-subrip" -> SRT;
            default -> NONE;
        };
    }
}

View File

@ -70,7 +70,6 @@ public class MediaDownloadedHandler implements Runnable {
if (item != null && item.getTranscriptUrl() != null) {
String transcript = TranscriptUtils.loadTranscriptFromUrl(item.getTranscriptUrl(), true);
if (!StringUtils.isEmpty(transcript)) {
item.setPodcastIndexTranscriptText(transcript);
TranscriptUtils.storeTranscript(media, transcript);
}
}

View File

@ -167,6 +167,7 @@ public class FeedUpdateWorker extends Worker {
newEpisodesNotification.showIfNeeded(getApplicationContext(), savedFeed);
}
} catch (Exception e) {
e.printStackTrace();
DBWriter.setFeedLastUpdateFailed(feed.getId(), true);
DownloadResult status = new DownloadResult(feed.getTitle(),
feed.getId(), Feed.FEEDFILETYPE_FEED, false,

View File

@ -1,5 +1,6 @@
package de.danoeh.antennapod.parser.transcript;
import de.danoeh.antennapod.model.feed.TranscriptType;
import org.apache.commons.lang3.StringUtils;
import de.danoeh.antennapod.model.feed.Transcript;
@ -8,18 +9,16 @@ public class TranscriptParser {
static final long MIN_SPAN = 5000L; // Merge short segments together to form a span of 5 seconds
static final long MAX_SPAN = 8000L; // Don't go beyond 8 seconds (8000 ms) when merging
public static Transcript parse(String str, String type) {
public static Transcript parse(String str, String typeStr) {
if (str == null || StringUtils.isBlank(str)) {
return null;
}
if ("application/json".equals(type)) {
return JsonTranscriptParser.parse(str);
}
if ("application/srt".equals(type) || "application/srr".equals(type) || "application/x-subrip".equals(type)) {
return SrtTranscriptParser.parse(str);
}
return null;
TranscriptType type = TranscriptType.fromMime(typeStr);
return switch (type) {
case JSON -> JsonTranscriptParser.parse(str);
case VTT -> VttTranscriptParser.parse(str);
case SRT -> SrtTranscriptParser.parse(str);
default -> null;
};
}
}

View File

@ -0,0 +1,144 @@
package de.danoeh.antennapod.parser.transcript;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.danoeh.antennapod.model.feed.Transcript;
import de.danoeh.antennapod.model.feed.TranscriptSegment;
/**
 * Very light WebVTT reader that turns the cue blocks of a WebVTT document into a
 * {@link Transcript}.
 *
 * <p>Only cue timings, cue payload text and voice spans ({@code <v Name>}) are
 * interpreted. Every other line (header, notes, cue identifiers, cue settings) is skipped.
 */
public class VttTranscriptParser {
    /**
     * A WebVTT timestamp: optional hours (two or more digits), then {@code mm:ss.ttt}.
     * Groups 1-4 are hours, minutes, seconds and milliseconds.
     */
    private static final Pattern TIMESTAMP_PATTERN =
            Pattern.compile("^(?:([0-9]{2,}):)?([0-9]{2}):([0-9]{2})\\.([0-9]{3})$");
    /** Opening voice tag with optional classes; group 1 is the annotation used as speaker name. */
    private static final Pattern VOICE_SPAN =
            Pattern.compile("<v(?:\\.[^\\t\\n\\r &<>.]+)*[ \\t]([^\\n\\r&>]+)>");

    /** Start and end of one cue, in milliseconds. */
    private record Timings(long start, long end) {}

    /**
     * Parses a WebVTT document.
     *
     * <p>Consecutive cues are merged into one segment while the speaker stays the same
     * and the merged span stays below {@link TranscriptParser#MAX_SPAN}; a segment is
     * emitted as soon as it spans at least {@link TranscriptParser#MIN_SPAN}. A cue
     * without a voice span keeps the speaker of the previous cue.
     *
     * @param str content of the WebVTT file, may be {@code null}
     * @return the transcript, or {@code null} if the input is blank, contains a line with
     *         {@code "-->"} whose timings cannot be parsed, or yields no segment at all
     */
    public static Transcript parse(String str) {
        if (StringUtils.isBlank(str)) {
            return null;
        }

        // WebVTT line terminators can be \r\n, \n or \r; normalize them all to \n
        List<String> lines = Arrays.asList(str.replaceAll("\r\n?", "\n").split("\n"));
        Iterator<String> iterator = lines.iterator();

        Transcript transcript = new Transcript();
        Set<String> speakers = new HashSet<>();
        String speaker = "";
        TranscriptSegment segment = null;

        // Iterate through cue blocks: only a cue timings line may contain "-->"
        while (iterator.hasNext()) {
            String line = iterator.next();
            if (!line.contains("-->")) {
                continue;
            }

            Timings timings = parseCueTimings(line);
            if (timings == null) {
                return null; // Input is broken
            }

            String payload = parseCuePayload(iterator);
            Matcher matcher = VOICE_SPAN.matcher(payload);
            if (matcher.find()) {
                speaker = matcher.group(1);
                speakers.add(speaker);
            }
            payload = Jsoup.parse(payload).text(); // remove all HTML tags

            // should we merge this cue into the pending segment?
            if (segment != null && segment.getSpeaker().equals(speaker)
                    && timings.end() - segment.getStartTime() < TranscriptParser.MAX_SPAN) {
                segment.append(timings.end(), payload);
            } else {
                if (segment != null) {
                    transcript.addSegment(segment);
                }
                segment = new TranscriptSegment(timings.start(), timings.end(), payload, speaker);
            }

            // is the pending segment long enough to be emitted without waiting for more?
            if (segment.getEndTime() - segment.getStartTime() >= TranscriptParser.MIN_SPAN) {
                transcript.addSegment(segment);
                segment = null;
            }
        }

        if (segment != null) {
            transcript.addSegment(segment);
        }
        if (transcript.getSegmentCount() == 0) {
            return null;
        }
        transcript.setSpeakers(speakers);
        return transcript;
    }

    /**
     * Parses a decimal number, treating an absent regex group as zero.
     *
     * @throws NumberFormatException if the digits do not fit into a {@code long}
     */
    private static long parseLongOrZero(@Nullable String s) {
        return StringUtils.isEmpty(s) ? 0 : Long.parseLong(s);
    }

    /**
     * Converts a WebVTT timestamp ({@code [hh:]mm:ss.ttt}) to milliseconds.
     *
     * @return the time in milliseconds, or -1 if the text is not a valid timestamp
     */
    private static long parseTimestamp(@NonNull String timestamp) {
        Matcher matcher = TIMESTAMP_PATTERN.matcher(timestamp);
        if (!matcher.matches()) {
            return -1;
        }
        try {
            long hours = parseLongOrZero(matcher.group(1));
            long minutes = parseLongOrZero(matcher.group(2));
            long seconds = parseLongOrZero(matcher.group(3));
            long milliseconds = parseLongOrZero(matcher.group(4));
            return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
        } catch (NumberFormatException e) {
            // The hours field has no upper digit limit, so it may not fit into a long
            return -1;
        }
    }

    /**
     * Parses a cue timings line such as {@code "00:01.000 --> 00:02.000 align:left"}.
     * Cue settings after the end timestamp are ignored.
     *
     * @return the timings, or {@code null} if either timestamp is missing or invalid
     */
    @Nullable
    private static Timings parseCueTimings(@NonNull String line) {
        String[] timestamps = line.split("-->");
        if (timestamps.length < 2) {
            return null;
        }
        long start = parseTimestamp(timestamps[0].trim());
        long end = parseTimestamp(timestamps[1].trim().split("[ \\t]")[0]);
        if (start == -1 || end == -1) {
            return null;
        }
        return new Timings(start, end);
    }

    /**
     * Reads payload lines up to the next empty line (which is consumed) or the end of
     * the input, and joins them with single spaces.
     */
    @NonNull
    private static String parseCuePayload(@NonNull Iterator<String> iterator) {
        StringBuilder body = new StringBuilder();
        while (iterator.hasNext()) {
            String line = iterator.next();
            if (line.isEmpty()) {
                break;
            }
            body.append(line.strip());
            body.append(" ");
        }
        return body.toString().strip();
    }
}

View File

@ -0,0 +1,172 @@
package de.danoeh.antennapod.parser.transcript;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.RobolectricTestRunner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import de.danoeh.antennapod.model.feed.Transcript;
/**
 * Tests for {@link VttTranscriptParser} and for the "text/vtt" dispatch of
 * {@link TranscriptParser}. Two fixtures with the same spoken content are parsed and
 * checked against the same expected segments.
 */
@RunWith(RobolectricTestRunner.class)
public class VttTranscriptParserTest {
// Plain fixture: cue timings, payload text and simple <v Name> voice spans only.
// NOTE(review): as shown here the cue blocks are not separated by empty lines, although
// the parser ends a cue payload at an empty line - confirm the fixture formatting.
private static String vttStrSimple = """
WEBVTT
00:00.000 --> 00:02.000
Intro without speaker
00:00:02.000 --> 00:03.000
Still no speaker
00:03.389 --> 00:00:09.000
<v Speaker 1>This is the first speaker.
00:09.150 --> 00:12.123
Let's assume it's still the first speaker.
00:13.000 --> 00:15.000
<v Speaker 2>And this is the second.
00:15.000 --> 00:16.000
<v Speaker 2>Still talking.
00:16.000 --> 00:18.000
<v Speaker 2>Still same line.
00:18.000 --> 00:19.000
<v Speaker 2>New line.
00:22.000 --> 00:26.500
<v Speaker 2> Too long to collapse with previous.
00:36.000 --> 00:38.000
<v Speaker 1>And again.
Je suis speaker 1.
01:00:00.000 --> 01:00:01.000
<v Speaker 1>Still talking after one hour.""";
// This is the same content as above, but with lots more WEBVTT features:
// NOTE lines, cue identifiers, cue settings after the timings, voice spans with
// classes, a closing </v> tag and nested <i>/<lang> tags.
private static String vttStrComplex = """
WEBVTT
NOTE This is a note
00:00.000 --> 00:02.000
Intro without speaker
00:00:02.000 --> 00:03.000
Still no speaker
NOTE Here is an other note
First block with a speaker
00:03.389 --> 00:00:09.000
<v.first Speaker 1>This is the first speaker.
00:09.150 --> 00:12.123
Let's assume it's still the first speaker.
00:13.000 --> 00:15.000\tposition:90% align:right size:35%
<v.second.loud Speaker 2>And this is the second.</v>
00:15.000 --> 00:16.000 position:10%,line-left align:left size:35%
<v Speaker 2>Still talking.
00:16.000 --> 00:18.000
<v Speaker 2>Still same line.
00:18.000 --> 00:19.000
<v Speaker 2>New line.
00:22.000 --> 00:26.500
<v Speaker 2> Too long to collapse with previous.
00:36.000 --> 00:38.000
<v Speaker 1>And again.
<i.foreignphrase><lang fr>Je suis</lang></i> speaker 1.
After one hour
01:00:00.000 --> 01:00:01.000
<v Speaker 1>Still talking after one hour.""";
/**
 * Asserts words, speaker and end time (in milliseconds) of every segment expected
 * from either fixture, looked up by the segment's start time.
 */
private void checkResults(Transcript result) {
// The first two cues have no speaker and are merged into one segment ending at 3000 ms
assertEquals("Intro without speaker Still no speaker", result.getSegmentAtTime(0L).getWords());
assertEquals("", result.getSegmentAtTime(0L).getSpeaker());
assertEquals(3000L, result.getSegmentAtTime(0L).getEndTime());
assertEquals("This is the first speaker.", result.getSegmentAtTime(3389L).getWords());
assertEquals("Speaker 1", result.getSegmentAtTime(3389L).getSpeaker());
assertEquals(9000L, result.getSegmentAtTime(3389L).getEndTime());
// This cue has no voice span: the previous speaker is expected to be kept
assertEquals("Let's assume it's still the first speaker.", result.getSegmentAtTime(9150L).getWords());
assertEquals("Speaker 1", result.getSegmentAtTime(9150L).getSpeaker());
assertEquals(12123L, result.getSegmentAtTime(9150L).getEndTime());
// Three short cues of the same speaker are merged into one segment
assertEquals("And this is the second. Still talking. Still same line.",
result.getSegmentAtTime(13000L).getWords());
assertEquals("Speaker 2", result.getSegmentAtTime(13000L).getSpeaker());
assertEquals(18000L, result.getSegmentAtTime(13000L).getEndTime());
assertEquals("New line.", result.getSegmentAtTime(18000L).getWords());
assertEquals("Speaker 2", result.getSegmentAtTime(18000L).getSpeaker());
assertEquals(19000L, result.getSegmentAtTime(18000L).getEndTime());
assertEquals("Too long to collapse with previous.", result.getSegmentAtTime(22000L).getWords());
assertEquals("Speaker 2", result.getSegmentAtTime(22000L).getSpeaker());
assertEquals(26500L, result.getSegmentAtTime(22000L).getEndTime());
// Multi-line payload: both lines are joined with a space and the tags are removed
assertEquals("And again. Je suis speaker 1.", result.getSegmentAtTime(36000L).getWords());
assertEquals("Speaker 1", result.getSegmentAtTime(36000L).getSpeaker());
assertEquals(38000L, result.getSegmentAtTime(36000L).getEndTime());
// 10000000L is later than every cue in the fixtures. NOTE(review): this presumably relies
// on getSegmentAtTime falling back to an earlier segment, as in testParse - confirm.
assertEquals("Still talking after one hour.", result.getSegmentAtTime(10000000L).getWords());
assertEquals("Speaker 1", result.getSegmentAtTime(3600000L).getSpeaker());
assertEquals(3601000L, result.getSegmentAtTime(3600000L).getEndTime());
}
/** Parses the plain fixture directly with {@link VttTranscriptParser}. */
@Test
public void testParseVttSimple() {
Transcript result = VttTranscriptParser.parse(vttStrSimple);
checkResults(result);
}
/** Parses the feature-rich fixture; the expected segments are the same as for the plain one. */
@Test
public void testParseVttComplex() {
Transcript result = VttTranscriptParser.parse(vttStrComplex);
checkResults(result);
}
/**
 * Goes through {@link TranscriptParser#parse} with the "text/vtt" type and checks the
 * inputs for which no transcript (null) is expected.
 */
@Test
public void testParse() {
String type = "text/vtt";
Transcript result;
result = TranscriptParser.parse(vttStrSimple, type);
// There isn't a segment at 800L, so go backwards and get the segment at 0L
assertEquals("Intro without speaker Still no speaker", result.getSegmentAtTime(800L).getWords());
// null input
result = TranscriptParser.parse(null, type);
assertNull(result);
// blank string
result = TranscriptParser.parse("", type);
assertNull(result);
// All empty lines
result = TranscriptParser.parse("\r\n\r\n\r\n\r\n", type);
assertNull(result);
// Just plain text, no cue timings line
result = TranscriptParser.parse("<v Speaker 1> Just text", type);
assertNull(result);
// passing the wrong type: WebVTT content declared as SubRip
result = TranscriptParser.parse(vttStrSimple, "application/srr");
assertNull(result);
// a type that TranscriptParser does not know
result = TranscriptParser.parse(vttStrSimple, "unknown");
assertNull(result);
}
}