Fix and tune feed item duplicate guesser (#7979)

This commit is contained in:
Hans-Peter Lehmann 2025-09-12 21:00:56 +02:00 committed by GitHub
parent 9db9dc7732
commit 0772b4998d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 237 additions and 83 deletions

View File

@ -1,7 +1,6 @@
package de.danoeh.antennapod.storage.database;
import android.content.Context;
import android.text.TextUtils;
import android.util.Log;
import de.danoeh.antennapod.event.FeedListUpdateEvent;
import de.danoeh.antennapod.model.download.DownloadError;
@ -45,38 +44,6 @@ public abstract class FeedDatabaseWriter {
return null;
}
/**
* Get a FeedItem by its identifying value.
*/
private static FeedItem searchFeedItemByIdentifyingValue(List<FeedItem> items, FeedItem searchItem) {
for (FeedItem item : items) {
if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) {
return item;
}
}
return null;
}
/**
* Guess if one of the items could actually mean the searched item, even if it uses another identifying value.
* This is to work around podcasters breaking their GUIDs.
*/
private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) {
// First, see if it is a well-behaving feed that contains an item with the same identifier
for (FeedItem item : items) {
if (FeedItemDuplicateGuesser.sameAndNotEmpty(item.getItemIdentifier(), searchItem.getItemIdentifier())) {
return item;
}
}
// Not found yet, start more expensive guessing
for (FeedItem item : items) {
if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) {
return item;
}
}
return null;
}
/**
* Adds new Feeds to the database or updates the old versions if they already exists. If another Feed with the same
* identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed.
@ -108,6 +75,9 @@ public abstract class FeedDatabaseWriter {
+ " already exists. Syncing new with existing one.");
Collections.sort(newFeed.getItems(), new FeedItemPubdateComparator());
FeedItemDuplicateGuesserPool newFeedDuplicateGuesser = new FeedItemDuplicateGuesserPool(newFeed.getItems());
FeedItemDuplicateGuesserPool savedFeedDuplicateGuesser
= new FeedItemDuplicateGuesserPool(savedFeed.getItems());
if (newFeed.getPageNr() == savedFeed.getPageNr()) {
savedFeed.updateFromOther(newFeed);
@ -128,7 +98,7 @@ public abstract class FeedDatabaseWriter {
for (int idx = 0; idx < newFeed.getItems().size(); idx++) {
final FeedItem item = newFeed.getItems().get(idx);
FeedItem possibleDuplicate = searchFeedItemGuessDuplicate(newFeed.getItems(), item);
FeedItem possibleDuplicate = newFeedDuplicateGuesser.guessDuplicate(item);
if (!newFeed.isLocalFeed() && possibleDuplicate != null && item != possibleDuplicate) {
// Canonical episode is the first one returned (usually oldest)
DBWriter.addDownloadStatus(new DownloadResult(item.getTitle(),
@ -142,9 +112,9 @@ public abstract class FeedDatabaseWriter {
continue;
}
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed.getItems(), item);
FeedItem oldItem = savedFeedDuplicateGuesser.findById(item);
if (!newFeed.isLocalFeed() && oldItem == null) {
oldItem = searchFeedItemGuessDuplicate(savedFeed.getItems(), item);
oldItem = savedFeedDuplicateGuesser.guessDuplicate(item);
if (oldItem != null) {
Log.d(TAG, "Repaired duplicate: " + oldItem + ", " + item);
DBWriter.addDownloadStatus(new DownloadResult(item.getTitle(),
@ -181,6 +151,7 @@ public abstract class FeedDatabaseWriter {
} else {
savedFeed.getItems().add(idx, item);
}
savedFeedDuplicateGuesser.add(item);
boolean shouldPerformNewEpisodesAction = item.getPubDate() == null
|| priorMostRecentDate == null
@ -217,7 +188,7 @@ public abstract class FeedDatabaseWriter {
Iterator<FeedItem> it = savedFeed.getItems().iterator();
while (it.hasNext()) {
FeedItem feedItem = it.next();
if (searchFeedItemByIdentifyingValue(newFeed.getItems(), feedItem) == null) {
if (newFeedDuplicateGuesser.findById(feedItem) == null) {
unlistedItems.add(feedItem);
it.remove();
}

View File

@ -69,7 +69,7 @@ public class FeedItemDuplicateGuesser {
return sameAndNotEmpty(canonicalizeTitle(item1.getTitle()), canonicalizeTitle(item2.getTitle()));
}
private static String canonicalizeTitle(String title) {
public static String canonicalizeTitle(String title) {
if (title == null) {
return "";
}

View File

@ -0,0 +1,60 @@
package de.danoeh.antennapod.storage.database;
import de.danoeh.antennapod.model.feed.FeedItem;
import org.apache.commons.lang3.StringUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class FeedItemDuplicateGuesserPool {
private final Map<String, List<FeedItem>> normalizedTitles = new HashMap<>();
private final Map<String, FeedItem> downloadUrls = new HashMap<>();
private final Map<String, FeedItem> identifiers = new HashMap<>();
public FeedItemDuplicateGuesserPool(List<FeedItem> itemsList) {
for (FeedItem item : itemsList) {
add(item);
}
}
public void add(FeedItem item) {
String normalizedTitle = FeedItemDuplicateGuesser.canonicalizeTitle(item.getTitle());
if (!normalizedTitles.containsKey(normalizedTitle)) {
normalizedTitles.put(normalizedTitle, new java.util.ArrayList<>());
}
normalizedTitles.get(normalizedTitle).add(item);
if (item.getMedia() != null && !StringUtils.isEmpty(item.getMedia().getStreamUrl())
&& !downloadUrls.containsKey(item.getMedia().getStreamUrl())) {
downloadUrls.put(item.getMedia().getStreamUrl(), item);
}
if (item.getIdentifyingValue() != null && !identifiers.containsKey(item.getIdentifyingValue())) {
identifiers.put(item.getIdentifyingValue(), item);
}
}
public FeedItem guessDuplicate(FeedItem searchItem) {
if (searchItem.getMedia() != null && !StringUtils.isEmpty(searchItem.getMedia().getStreamUrl())
&& downloadUrls.containsKey(searchItem.getMedia().getStreamUrl())) {
return downloadUrls.get(searchItem.getMedia().getStreamUrl());
}
String normalizedTitle = FeedItemDuplicateGuesser.canonicalizeTitle(searchItem.getTitle());
List<FeedItem> candidates = normalizedTitles.get(normalizedTitle);
if (candidates == null) {
return null;
}
for (FeedItem item : candidates) {
if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) {
return item;
}
}
return null;
}
public FeedItem findById(FeedItem item) {
if (identifiers.containsKey(item.getIdentifyingValue())) {
return identifiers.get(item.getIdentifyingValue());
}
return null;
}
}

View File

@ -1,33 +1,28 @@
package de.danoeh.antennapod.net.download.service.episode.autodownload;
package de.danoeh.antennapod.storage.database;
import android.content.Context;
import androidx.test.platform.app.InstrumentationRegistry;
import de.danoeh.antennapod.model.download.DownloadError;
import de.danoeh.antennapod.model.download.DownloadResult;
import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedItem;
import de.danoeh.antennapod.model.feed.FeedItemFilter;
import de.danoeh.antennapod.model.feed.FeedMedia;
import de.danoeh.antennapod.model.feed.SortOrder;
import de.danoeh.antennapod.net.sync.serviceinterface.SynchronizationQueue;
import de.danoeh.antennapod.net.sync.serviceinterface.SynchronizationQueueStub;
import de.danoeh.antennapod.storage.database.DBReader;
import de.danoeh.antennapod.storage.database.DBWriter;
import de.danoeh.antennapod.storage.database.FeedDatabaseWriter;
import de.danoeh.antennapod.storage.database.PodDBAdapter;
import org.junit.After;
import de.danoeh.antennapod.storage.preferences.UserPreferences;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.RobolectricTestRunner;
import org.robolectric.RuntimeEnvironment;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.concurrent.ExecutionException;
import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedItem;
import de.danoeh.antennapod.model.feed.FeedMedia;
import de.danoeh.antennapod.storage.preferences.PlaybackPreferences;
import de.danoeh.antennapod.storage.preferences.UserPreferences;
import static java.util.Collections.singletonList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
@ -35,40 +30,119 @@ import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
/**
* Test class for {@link FeedDatabaseWriter}.
*/
@RunWith(RobolectricTestRunner.class)
public class DbTasksTest {
public class FeedDatabaseWriterTest {
private Context context;
@Before
public void setUp() {
context = InstrumentationRegistry.getInstrumentation().getTargetContext();
context = RuntimeEnvironment.getApplication();
UserPreferences.init(context);
PlaybackPreferences.init(context);
SynchronizationQueue.setInstance(new SynchronizationQueueStub());
// create new database
PodDBAdapter.init(context);
PodDBAdapter.deleteDatabase();
PodDBAdapter adapter = PodDBAdapter.getInstance();
adapter.open();
adapter.close();
SynchronizationQueue.setInstance(new SynchronizationQueueStub());
}
@After
public void tearDown() {
DBWriter.tearDownTests();
PodDBAdapter.tearDownTests();
@Test
public void testStoreNewFeed() {
Feed feed = createFeed();
for (int i = 0; i < 3; i++) {
feed.getItems().add(createItem("item-" + i, "Item " + i, feed));
}
Feed updatedFeed = FeedDatabaseWriter.updateFeed(context, feed, false);
List<FeedItem> storedItems = DBReader.getFeedItemList(updatedFeed, FeedItemFilter.unfiltered(),
SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE);
assertEquals(3, storedItems.size());
for (int i = 0; i < 3; i++) {
assertEquals("item-" + i, storedItems.get(i).getItemIdentifier());
}
}
@Test
public void testAddItemsToExistingFeed() {
Feed feed = createFeed();
for (int i = 0; i < 3; i++) {
feed.getItems().add(createItem("item-" + i, "Item " + i, feed));
}
feed = FeedDatabaseWriter.updateFeed(context, feed, false);
Feed updatedFeed = createFeed();
updatedFeed.setId(feed.getId());
for (int i = 3; i < 6; i++) {
updatedFeed.getItems().add(createItem("item-" + i, "Item " + i, feed));
}
FeedDatabaseWriter.updateFeed(context, updatedFeed, false);
List<FeedItem> dbItems = DBReader.getFeedItemList(feed, FeedItemFilter.unfiltered(),
SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE);
assertEquals(6, dbItems.size());
for (int i = 0; i < 6; i++) {
assertEquals("item-" + i, dbItems.get(i).getItemIdentifier());
}
}
@Test
public void testAddOrUpdateItems() throws ExecutionException, InterruptedException {
Feed feed = createFeed();
for (int i = 0; i < 3; i++) {
feed.getItems().add(createItem("item-" + i, "Item " + i, feed));
}
feed = FeedDatabaseWriter.updateFeed(context, feed, false);
DBReader.getFeedItemList(feed, FeedItemFilter.unfiltered(),
SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE);
DBWriter.markItemPlayed(feed.getItems().get(2), FeedItem.PLAYED, false).get();
Feed updatedFeed = createFeed();
updatedFeed.setId(feed.getId());
for (int i = 2; i < 5; i++) {
updatedFeed.getItems().add(createItem("item-" + i, "Item " + i, feed));
}
FeedDatabaseWriter.updateFeed(context, updatedFeed, false);
List<FeedItem> dbItems = DBReader.getFeedItemList(feed, FeedItemFilter.unfiltered(),
SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE);
assertEquals(5, dbItems.size());
for (int i = 0; i < 5; i++) {
assertEquals("item-" + i, dbItems.get(i).getItemIdentifier());
}
assertEquals(FeedItem.PLAYED, dbItems.get(2).getPlayState());
}
@Test
public void testDuplicateItemsInFeed() {
Feed feed = createFeed();
feed.getItems().add(createItem("id1", "Duplicate Title", feed));
feed.getItems().add(createItem("id2", "Duplicate Title", feed));
FeedDatabaseWriter.updateFeed(context, feed, false); // First update just takes the feed without complaining
FeedDatabaseWriter.updateFeed(context, feed, false);
List<DownloadResult> downloadLog = DBReader.getDownloadLog();
assertEquals(1, downloadLog.size());
assertEquals(DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, downloadLog.get(0).getReason());
}
@Test
public void testGuidUpdated() {
Feed feed = createFeed();
feed.getItems().add(createItem("old-id", "Unique Title", feed));
FeedDatabaseWriter.updateFeed(context, feed, false);
Feed newFeed = createFeed();
newFeed.getItems().add(createItem("new-id", "Unique Title", newFeed));
Feed stored = FeedDatabaseWriter.updateFeed(context, newFeed, false);
assertEquals(1, stored.getItems().size());
assertEquals("new-id", stored.getItems().get(0).getItemIdentifier());
}
@Test
public void testUpdateFeedNewFeed() {
final int numItems = 10;
Feed feed = new Feed("url", null, "title");
feed.setItems(new ArrayList<>());
Feed feed = createFeed();
for (int i = 0; i < numItems; i++) {
feed.getItems().add(new FeedItem(0, "item " + i, "id " + i, "link " + i,
new Date(), FeedItem.UNPLAYED, feed));
@ -86,12 +160,9 @@ public class DbTasksTest {
/** Two feeds with the same title, but different download URLs should be treated as different feeds. */
@Test
public void testUpdateFeedSameTitle() {
Feed feed1 = new Feed("url1", null, "title");
Feed feed2 = new Feed("url2", null, "title");
feed1.setItems(new ArrayList<>());
feed2.setItems(new ArrayList<>());
Feed feed1 = createFeed();
Feed feed2 = createFeed();
feed2.setDownloadUrl("different url");
Feed savedFeed1 = FeedDatabaseWriter.updateFeed(context, feed1, false);
Feed savedFeed2 = FeedDatabaseWriter.updateFeed(context, feed2, false);
@ -104,8 +175,7 @@ public class DbTasksTest {
final int numItemsOld = 10;
final int numItemsNew = 10;
final Feed feed = new Feed("url", null, "title");
feed.setItems(new ArrayList<>());
final Feed feed = createFeed();
for (int i = 0; i < numItemsOld; i++) {
feed.getItems().add(new FeedItem(0, "item " + i, "id " + i, "link " + i,
new Date(i), FeedItem.PLAYED, feed));
@ -144,9 +214,9 @@ public class DbTasksTest {
@Test
public void testUpdateFeedMediaUrlResetState() {
final Feed feed = new Feed("url", null, "title");
final Feed feed = createFeed();
FeedItem item = new FeedItem(0, "item", "id", "link", new Date(), FeedItem.PLAYED, feed);
feed.setItems(singletonList(item));
feed.setItems(Collections.singletonList(item));
PodDBAdapter adapter = PodDBAdapter.getInstance();
adapter.open();
@ -173,8 +243,7 @@ public class DbTasksTest {
@Test
public void testUpdateFeedRemoveUnlistedItems() {
final Feed feed = new Feed("url", null, "title");
feed.setItems(new ArrayList<>());
final Feed feed = createFeed();
for (int i = 0; i < 10; i++) {
feed.getItems().add(
new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed));
@ -195,8 +264,7 @@ public class DbTasksTest {
@Test
public void testUpdateFeedSetDuplicate() {
final Feed feed = new Feed("url", null, "title");
feed.setItems(new ArrayList<>());
final Feed feed = createFeed();
for (int i = 0; i < 10; i++) {
FeedItem item =
new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed);
@ -249,4 +317,19 @@ public class DbTasksTest {
lastDate = item.getPubDate();
}
}
private Feed createFeed() {
Feed feed = new Feed("url", null, null);
feed.setItems(new ArrayList<>());
return feed;
}
private FeedItem createItem(String identifier, String title, Feed feed) {
FeedItem item = new FeedItem();
item.setItemIdentifier(identifier);
item.setTitle(title);
item.setMedia(new FeedMedia(item, "url-" + title, 2, "mime"));
item.setFeed(feed);
return item;
}
}

View File

@ -0,0 +1,40 @@
package de.danoeh.antennapod.storage.database;
import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedItem;
import de.danoeh.antennapod.model.feed.FeedMedia;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import java.util.ArrayList;
import static org.junit.Assert.assertSame;
@RunWith(JUnit4.class)
public class FeedItemDuplicateGuesserPoolTest {
@Test
public void testDuplicateIsConsistent() {
Feed feed = new Feed("url", null, null);
FeedItem item1 = createItem("id1", "Title", feed);
FeedItem item2 = createItem("id2", "Title", feed);
FeedItemDuplicateGuesserPool pool = new FeedItemDuplicateGuesserPool(new ArrayList<>());
pool.add(item1);
assertSame(item1, pool.guessDuplicate(item1));
assertSame(item1, pool.guessDuplicate(item2));
pool.add(item2);
assertSame(item1, pool.guessDuplicate(item1));
assertSame(item1, pool.guessDuplicate(item2));
}
private FeedItem createItem(String identifier, String title, Feed feed) {
FeedItem item = new FeedItem();
item.setItemIdentifier(identifier);
item.setTitle(title);
item.setMedia(new FeedMedia(item, "url-" + title, 2, "mime"));
item.setFeed(feed);
return item;
}
}