mirror of
https://github.com/AntennaPod/AntennaPod.git
synced 2025-10-29 03:36:21 +00:00
Vtt support for transcripts (#7537)
This commit is contained in:
parent
373cde5c0a
commit
560321cfd6
@ -78,6 +78,8 @@ public class TranscriptAdapter extends RecyclerView.Adapter<TranscriptViewholder
|
||||
if (speakers.isEmpty() && (position % 5 == 0)) {
|
||||
holder.viewTimecode.setVisibility(View.VISIBLE);
|
||||
holder.viewTimecode.setText(timecode);
|
||||
} else {
|
||||
holder.viewTimecode.setVisibility(View.GONE);
|
||||
}
|
||||
holder.viewContent.setText(seg.getWords());
|
||||
}
|
||||
|
||||
@ -45,7 +45,6 @@ public class FeedItem implements Serializable {
|
||||
private String socialInteractUrl;
|
||||
private String podcastIndexTranscriptUrl;
|
||||
private String podcastIndexTranscriptType;
|
||||
private String podcastIndexTranscriptText;
|
||||
private Transcript transcript;
|
||||
|
||||
private int state;
|
||||
@ -456,29 +455,15 @@ public class FeedItem implements Serializable {
|
||||
return podcastIndexTranscriptType;
|
||||
}
|
||||
|
||||
public void updateTranscriptPreferredFormat(String type, String url) {
|
||||
if (StringUtils.isEmpty(type) || StringUtils.isEmpty(url)) {
|
||||
public void updateTranscriptPreferredFormat(String typeStr, String url) {
|
||||
if (StringUtils.isEmpty(typeStr) || StringUtils.isEmpty(url)) {
|
||||
return;
|
||||
}
|
||||
|
||||
String canonicalSrr = "application/srr";
|
||||
String jsonType = "application/json";
|
||||
|
||||
switch (type) {
|
||||
case "application/json":
|
||||
podcastIndexTranscriptUrl = url;
|
||||
podcastIndexTranscriptType = type;
|
||||
break;
|
||||
case "application/srr":
|
||||
case "application/srt":
|
||||
case "application/x-subrip":
|
||||
if (podcastIndexTranscriptUrl == null || !podcastIndexTranscriptType.equals(jsonType)) {
|
||||
podcastIndexTranscriptUrl = url;
|
||||
podcastIndexTranscriptType = canonicalSrr;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
TranscriptType type = TranscriptType.fromMime(typeStr);
|
||||
TranscriptType previousType = TranscriptType.fromMime(podcastIndexTranscriptType);
|
||||
if (type.priority > previousType.priority) {
|
||||
podcastIndexTranscriptUrl = url;
|
||||
podcastIndexTranscriptType = type.canonicalMime;
|
||||
}
|
||||
}
|
||||
|
||||
@ -490,14 +475,6 @@ public class FeedItem implements Serializable {
|
||||
transcript = t;
|
||||
}
|
||||
|
||||
public String getPodcastIndexTranscriptText() {
|
||||
return podcastIndexTranscriptText;
|
||||
}
|
||||
|
||||
public String setPodcastIndexTranscriptText(String str) {
|
||||
return podcastIndexTranscriptText = str;
|
||||
}
|
||||
|
||||
public boolean hasTranscript() {
|
||||
return (podcastIndexTranscriptUrl != null);
|
||||
}
|
||||
|
||||
@ -2,8 +2,8 @@ package de.danoeh.antennapod.model.feed;
|
||||
|
||||
public class TranscriptSegment {
|
||||
private final long startTime;
|
||||
private final long endTime;
|
||||
private final String words;
|
||||
private long endTime;
|
||||
private String words;
|
||||
private final String speaker;
|
||||
|
||||
public TranscriptSegment(long start, long end, String w, String s) {
|
||||
@ -13,6 +13,11 @@ public class TranscriptSegment {
|
||||
speaker = s;
|
||||
}
|
||||
|
||||
public void append(long newEndTime, String wordsToAppend) {
|
||||
endTime = newEndTime;
|
||||
words += " " + wordsToAppend;
|
||||
}
|
||||
|
||||
public long getStartTime() {
|
||||
return startTime;
|
||||
}
|
||||
|
||||
@ -0,0 +1,28 @@
|
||||
package de.danoeh.antennapod.model.feed;
|
||||
|
||||
/**
 * Transcript formats that can be recognized from a MIME type.
 *
 * <p>Each constant carries a {@link #priority} (a larger value means a more preferred format)
 * and the {@link #canonicalMime} string used to represent the format.
 * {@link #NONE} stands for "no or unknown format" and has the lowest priority.
 */
public enum TranscriptType {
    JSON(4, "application/json"),
    VTT(3, "text/vtt"),
    SRT(2, "application/srt"),
    NONE(0, "");

    /** Relative preference of this format; larger values are preferred over smaller ones. */
    public final int priority;

    /** MIME type string that represents this format; empty for {@link #NONE}. */
    public final String canonicalMime;

    TranscriptType(int priority, String canonicalMime) {
        this.priority = priority;
        this.canonicalMime = canonicalMime;
    }

    /**
     * Maps a MIME type string to the matching transcript format.
     *
     * <p>Matching ignores letter case, surrounding whitespace and any parameters that follow
     * a {@code ';'} (for example {@code "text/vtt; charset=utf-8"}), because MIME types are
     * case-insensitive and may legally carry parameters.
     *
     * @param type MIME type to look up, may be {@code null}
     * @return the matching format, or {@link #NONE} if {@code type} is {@code null} or not recognized
     */
    public static TranscriptType fromMime(String type) {
        if (type == null) {
            return NONE;
        }
        // Keep only the "type/subtype" part; parameters such as charset do not affect the format.
        int parametersStart = type.indexOf(';');
        String essence = parametersStart < 0 ? type : type.substring(0, parametersStart);
        // Locale.ROOT keeps lower-casing independent of the device locale (e.g. Turkish dotless i).
        return switch (essence.trim().toLowerCase(java.util.Locale.ROOT)) {
            case "application/json" -> JSON;
            case "text/vtt" -> VTT;
            case "application/srt", "application/srr", "application/x-subrip" -> SRT;
            default -> NONE;
        };
    }
}
|
||||
@ -70,7 +70,6 @@ public class MediaDownloadedHandler implements Runnable {
|
||||
if (item != null && item.getTranscriptUrl() != null) {
|
||||
String transcript = TranscriptUtils.loadTranscriptFromUrl(item.getTranscriptUrl(), true);
|
||||
if (!StringUtils.isEmpty(transcript)) {
|
||||
item.setPodcastIndexTranscriptText(transcript);
|
||||
TranscriptUtils.storeTranscript(media, transcript);
|
||||
}
|
||||
}
|
||||
|
||||
@ -167,6 +167,7 @@ public class FeedUpdateWorker extends Worker {
|
||||
newEpisodesNotification.showIfNeeded(getApplicationContext(), savedFeed);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
DBWriter.setFeedLastUpdateFailed(feed.getId(), true);
|
||||
DownloadResult status = new DownloadResult(feed.getTitle(),
|
||||
feed.getId(), Feed.FEEDFILETYPE_FEED, false,
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.TranscriptType;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
@ -8,18 +9,16 @@ public class TranscriptParser {
|
||||
static final long MIN_SPAN = 5000L; // Merge short segments together to form a span of 5 seconds
|
||||
static final long MAX_SPAN = 8000L; // Don't go beyond 8 seconds when merging
|
||||
|
||||
public static Transcript parse(String str, String type) {
|
||||
public static Transcript parse(String str, String typeStr) {
|
||||
if (str == null || StringUtils.isBlank(str)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if ("application/json".equals(type)) {
|
||||
return JsonTranscriptParser.parse(str);
|
||||
}
|
||||
|
||||
if ("application/srt".equals(type) || "application/srr".equals(type) || "application/x-subrip".equals(type)) {
|
||||
return SrtTranscriptParser.parse(str);
|
||||
}
|
||||
return null;
|
||||
TranscriptType type = TranscriptType.fromMime(typeStr);
|
||||
return switch (type) {
|
||||
case JSON -> JsonTranscriptParser.parse(str);
|
||||
case VTT -> VttTranscriptParser.parse(str);
|
||||
case SRT -> SrtTranscriptParser.parse(str);
|
||||
default -> null;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,144 @@
|
||||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import androidx.annotation.NonNull;
|
||||
import androidx.annotation.Nullable;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
import de.danoeh.antennapod.model.feed.TranscriptSegment;
|
||||
|
||||
public class VttTranscriptParser {
|
||||
private static final Pattern TIMESTAMP_PATTERN =
|
||||
Pattern.compile("^(?:([0-9]{2}):)?([0-9]{2}):([0-9]{2})\\.([0-9]{3})$");
|
||||
|
||||
private static final Pattern VOICE_SPAN =
|
||||
Pattern.compile("<v(?:\\.[^\\t\\n\\r &<>.]+)*[ \\t]([^\\n\\r&>]+)>");
|
||||
|
||||
private record Timings(long start, long end) {}
|
||||
|
||||
public static Transcript parse(String str) {
|
||||
// This is basically a very light WebVTT parser.
|
||||
// It uses WebVTT properties to be both exact and very light.
|
||||
// We will only be parsing the WebVTT cue blocks.
|
||||
|
||||
if (StringUtils.isBlank(str)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// WebVTT line terminator can be \r\n, \n or \n, let's use only one
|
||||
str = str.replaceAll("\r\n?", "\n");
|
||||
List<String> lines = Arrays.asList(str.split("\n"));
|
||||
|
||||
Transcript transcript = new Transcript();
|
||||
Iterator<String> iterator = lines.iterator();
|
||||
Set<String> speakers = new HashSet<>();
|
||||
String speaker = "";
|
||||
TranscriptSegment segment = null;
|
||||
|
||||
// Iterate through cue blocks
|
||||
while (iterator.hasNext()) {
|
||||
String line = iterator.next();
|
||||
|
||||
if (!line.contains("-->")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Timings timings = parseCueTimings(line);
|
||||
if (timings == null) {
|
||||
return null; // Input is broken
|
||||
}
|
||||
|
||||
String payload = parseCuePayload(iterator);
|
||||
|
||||
Matcher matcher = VOICE_SPAN.matcher(payload);
|
||||
if (matcher.find()) {
|
||||
speaker = matcher.group(1);
|
||||
speakers.add(speaker);
|
||||
}
|
||||
|
||||
payload = Jsoup.parse(payload).text(); // remove all HTML tags
|
||||
|
||||
// should we merge this segment with the previous one?
|
||||
if (segment != null && segment.getSpeaker().equals(speaker)
|
||||
&& timings.end - segment.getStartTime() < TranscriptParser.MAX_SPAN) {
|
||||
segment.append(timings.end, payload);
|
||||
} else {
|
||||
if (segment != null) {
|
||||
transcript.addSegment(segment);
|
||||
}
|
||||
segment = new TranscriptSegment(timings.start, timings.end, payload, speaker);
|
||||
}
|
||||
|
||||
// do we have a candidate segment long enough to add it without trying to add more
|
||||
if (segment.getEndTime() - segment.getStartTime() >= TranscriptParser.MIN_SPAN) {
|
||||
transcript.addSegment(segment);
|
||||
segment = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (segment != null) {
|
||||
transcript.addSegment(segment);
|
||||
}
|
||||
|
||||
if (transcript.getSegmentCount() == 0) {
|
||||
return null;
|
||||
}
|
||||
transcript.setSpeakers(speakers);
|
||||
return transcript;
|
||||
}
|
||||
|
||||
private static long parseIntOrNull(@Nullable String s) {
|
||||
return StringUtils.isEmpty(s) ? 0 : Integer.parseInt(s);
|
||||
}
|
||||
|
||||
private static long parseTimestamp(@NonNull String timestamp) {
|
||||
Matcher matcher = TIMESTAMP_PATTERN.matcher(timestamp);
|
||||
if (!matcher.matches()) {
|
||||
return -1;
|
||||
}
|
||||
long hours = parseIntOrNull(matcher.group(1));
|
||||
long minutes = parseIntOrNull(matcher.group(2));
|
||||
long seconds = parseIntOrNull(matcher.group(3));
|
||||
long milliseconds = parseIntOrNull(matcher.group(4));
|
||||
return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private static Timings parseCueTimings(@NonNull String line) {
|
||||
String[] timestamps = line.split("-->");
|
||||
if (timestamps.length < 2) {
|
||||
return null;
|
||||
}
|
||||
long start = parseTimestamp(timestamps[0].trim());
|
||||
long end = parseTimestamp(timestamps[1].trim().split("[ \\t]")[0]);
|
||||
if (start == -1 || end == -1) {
|
||||
return null;
|
||||
}
|
||||
return new Timings(start, end);
|
||||
}
|
||||
|
||||
@NonNull
|
||||
private static String parseCuePayload(@NonNull Iterator<String> iterator) {
|
||||
StringBuilder body = new StringBuilder();
|
||||
while (iterator.hasNext()) {
|
||||
String line = iterator.next();
|
||||
if (line.isEmpty()) {
|
||||
break;
|
||||
}
|
||||
body.append(line.strip());
|
||||
body.append(" ");
|
||||
}
|
||||
return body.toString().strip();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,172 @@
|
||||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.robolectric.RobolectricTestRunner;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
|
||||
@RunWith(RobolectricTestRunner.class)
|
||||
public class VttTranscriptParserTest {
|
||||
|
||||
private static String vttStrSimple = """
|
||||
WEBVTT
|
||||
|
||||
00:00.000 --> 00:02.000
|
||||
Intro without speaker
|
||||
|
||||
00:00:02.000 --> 00:03.000
|
||||
Still no speaker
|
||||
|
||||
00:03.389 --> 00:00:09.000
|
||||
<v Speaker 1>This is the first speaker.
|
||||
|
||||
00:09.150 --> 00:12.123
|
||||
Let's assume it's still the first speaker.
|
||||
|
||||
00:13.000 --> 00:15.000
|
||||
<v Speaker 2>And this is the second.
|
||||
|
||||
00:15.000 --> 00:16.000
|
||||
<v Speaker 2>Still talking.
|
||||
|
||||
00:16.000 --> 00:18.000
|
||||
<v Speaker 2>Still same line.
|
||||
|
||||
00:18.000 --> 00:19.000
|
||||
<v Speaker 2>New line.
|
||||
|
||||
00:22.000 --> 00:26.500
|
||||
<v Speaker 2> Too long to collapse with previous.
|
||||
|
||||
00:36.000 --> 00:38.000
|
||||
<v Speaker 1>And again.
|
||||
Je suis speaker 1.
|
||||
|
||||
01:00:00.000 --> 01:00:01.000
|
||||
<v Speaker 1>Still talking after one hour.""";
|
||||
|
||||
// This is the same content as above, but with lots more WEBVTT features
|
||||
private static String vttStrComplex = """
|
||||
WEBVTT
|
||||
|
||||
NOTE This is a note
|
||||
|
||||
00:00.000 --> 00:02.000
|
||||
Intro without speaker
|
||||
|
||||
00:00:02.000 --> 00:03.000
|
||||
Still no speaker
|
||||
|
||||
NOTE Here is an other note
|
||||
|
||||
First block with a speaker
|
||||
00:03.389 --> 00:00:09.000
|
||||
<v.first Speaker 1>This is the first speaker.
|
||||
|
||||
00:09.150 --> 00:12.123
|
||||
Let's assume it's still the first speaker.
|
||||
|
||||
00:13.000 --> 00:15.000\tposition:90% align:right size:35%
|
||||
<v.second.loud Speaker 2>And this is the second.</v>
|
||||
|
||||
00:15.000 --> 00:16.000 position:10%,line-left align:left size:35%
|
||||
<v Speaker 2>Still talking.
|
||||
|
||||
00:16.000 --> 00:18.000
|
||||
<v Speaker 2>Still same line.
|
||||
|
||||
00:18.000 --> 00:19.000
|
||||
<v Speaker 2>New line.
|
||||
|
||||
00:22.000 --> 00:26.500
|
||||
<v Speaker 2> Too long to collapse with previous.
|
||||
|
||||
00:36.000 --> 00:38.000
|
||||
<v Speaker 1>And again.
|
||||
<i.foreignphrase><lang fr>Je suis</lang></i> speaker 1.
|
||||
|
||||
After one hour
|
||||
01:00:00.000 --> 01:00:01.000
|
||||
<v Speaker 1>Still talking after one hour.""";
|
||||
|
||||
private void checkResults(Transcript result) {
|
||||
assertEquals("Intro without speaker Still no speaker", result.getSegmentAtTime(0L).getWords());
|
||||
assertEquals("", result.getSegmentAtTime(0L).getSpeaker());
|
||||
assertEquals(3000L, result.getSegmentAtTime(0L).getEndTime());
|
||||
|
||||
assertEquals("This is the first speaker.", result.getSegmentAtTime(3389L).getWords());
|
||||
assertEquals("Speaker 1", result.getSegmentAtTime(3389L).getSpeaker());
|
||||
assertEquals(9000L, result.getSegmentAtTime(3389L).getEndTime());
|
||||
|
||||
assertEquals("Let's assume it's still the first speaker.", result.getSegmentAtTime(9150L).getWords());
|
||||
assertEquals("Speaker 1", result.getSegmentAtTime(9150L).getSpeaker());
|
||||
assertEquals(12123L, result.getSegmentAtTime(9150L).getEndTime());
|
||||
|
||||
assertEquals("And this is the second. Still talking. Still same line.",
|
||||
result.getSegmentAtTime(13000L).getWords());
|
||||
assertEquals("Speaker 2", result.getSegmentAtTime(13000L).getSpeaker());
|
||||
assertEquals(18000L, result.getSegmentAtTime(13000L).getEndTime());
|
||||
|
||||
assertEquals("New line.", result.getSegmentAtTime(18000L).getWords());
|
||||
assertEquals("Speaker 2", result.getSegmentAtTime(18000L).getSpeaker());
|
||||
assertEquals(19000L, result.getSegmentAtTime(18000L).getEndTime());
|
||||
|
||||
assertEquals("Too long to collapse with previous.", result.getSegmentAtTime(22000L).getWords());
|
||||
assertEquals("Speaker 2", result.getSegmentAtTime(22000L).getSpeaker());
|
||||
assertEquals(26500L, result.getSegmentAtTime(22000L).getEndTime());
|
||||
|
||||
assertEquals("And again. Je suis speaker 1.", result.getSegmentAtTime(36000L).getWords());
|
||||
assertEquals("Speaker 1", result.getSegmentAtTime(36000L).getSpeaker());
|
||||
assertEquals(38000L, result.getSegmentAtTime(36000L).getEndTime());
|
||||
|
||||
assertEquals("Still talking after one hour.", result.getSegmentAtTime(10000000L).getWords());
|
||||
assertEquals("Speaker 1", result.getSegmentAtTime(3600000L).getSpeaker());
|
||||
assertEquals(3601000L, result.getSegmentAtTime(3600000L).getEndTime());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseVttSimple() {
|
||||
Transcript result = VttTranscriptParser.parse(vttStrSimple);
|
||||
checkResults(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseVttComplex() {
|
||||
Transcript result = VttTranscriptParser.parse(vttStrComplex);
|
||||
checkResults(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParse() {
|
||||
String type = "text/vtt";
|
||||
Transcript result;
|
||||
|
||||
result = TranscriptParser.parse(vttStrSimple, type);
|
||||
// There isn't a segment at 800L, so go backwards and get the segment at 0L
|
||||
assertEquals("Intro without speaker Still no speaker", result.getSegmentAtTime(800L).getWords());
|
||||
|
||||
result = TranscriptParser.parse(null, type);
|
||||
assertNull(result);
|
||||
|
||||
// blank string
|
||||
result = TranscriptParser.parse("", type);
|
||||
assertNull(result);
|
||||
|
||||
// All empty lines
|
||||
result = TranscriptParser.parse("\r\n\r\n\r\n\r\n", type);
|
||||
assertNull(result);
|
||||
|
||||
// Just plain text
|
||||
result = TranscriptParser.parse("<v Speaker 1> Just text", type);
|
||||
assertNull(result);
|
||||
|
||||
// passing the wrong type
|
||||
result = TranscriptParser.parse(vttStrSimple, "application/srr");
|
||||
assertNull(result);
|
||||
result = TranscriptParser.parse(vttStrSimple, "unknown");
|
||||
assertNull(result);
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user