From 0772b4998d288ad444242710260ad22a43eb6388 Mon Sep 17 00:00:00 2001 From: Hans-Peter Lehmann Date: Fri, 12 Sep 2025 21:00:56 +0200 Subject: [PATCH] Fix and tune feed item duplicate guesser (#7979) --- .../storage/database/FeedDatabaseWriter.java | 45 +---- .../database/FeedItemDuplicateGuesser.java | 2 +- .../FeedItemDuplicateGuesserPool.java | 60 ++++++ .../database/FeedDatabaseWriterTest.java | 173 +++++++++++++----- .../FeedItemDuplicateGuesserPoolTest.java | 40 ++++ 5 files changed, 237 insertions(+), 83 deletions(-) create mode 100644 storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPool.java rename net/download/service/src/test/java/de/danoeh/antennapod/net/download/service/episode/autodownload/DbTasksTest.java => storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriterTest.java (61%) create mode 100644 storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPoolTest.java diff --git a/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriter.java b/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriter.java index c00abb157..4b0654a52 100644 --- a/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriter.java +++ b/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriter.java @@ -1,7 +1,6 @@ package de.danoeh.antennapod.storage.database; import android.content.Context; -import android.text.TextUtils; import android.util.Log; import de.danoeh.antennapod.event.FeedListUpdateEvent; import de.danoeh.antennapod.model.download.DownloadError; @@ -45,38 +44,6 @@ public abstract class FeedDatabaseWriter { return null; } - /** - * Get a FeedItem by its identifying value. - */ - private static FeedItem searchFeedItemByIdentifyingValue(List items, FeedItem searchItem) { - for (FeedItem item : items) { - if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) { - return item; - } - } - return null; - } - - /** - * Guess if one of the items could actually mean the searched item, even if it uses another identifying value. - * This is to work around podcasters breaking their GUIDs. - */ - private static FeedItem searchFeedItemGuessDuplicate(List items, FeedItem searchItem) { - // First, see if it is a well-behaving feed that contains an item with the same identifier - for (FeedItem item : items) { - if (FeedItemDuplicateGuesser.sameAndNotEmpty(item.getItemIdentifier(), searchItem.getItemIdentifier())) { - return item; - } - } - // Not found yet, start more expensive guessing - for (FeedItem item : items) { - if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) { - return item; - } - } - return null; - } - /** * Adds new Feeds to the database or updates the old versions if they already exists. If another Feed with the same * identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed. @@ -108,6 +75,9 @@ public abstract class FeedDatabaseWriter { + " already exists. Syncing new with existing one."); Collections.sort(newFeed.getItems(), new FeedItemPubdateComparator()); + FeedItemDuplicateGuesserPool newFeedDuplicateGuesser = new FeedItemDuplicateGuesserPool(newFeed.getItems()); + FeedItemDuplicateGuesserPool savedFeedDuplicateGuesser + = new FeedItemDuplicateGuesserPool(savedFeed.getItems()); if (newFeed.getPageNr() == savedFeed.getPageNr()) { savedFeed.updateFromOther(newFeed); @@ -128,7 +98,7 @@ public abstract class FeedDatabaseWriter { for (int idx = 0; idx < newFeed.getItems().size(); idx++) { final FeedItem item = newFeed.getItems().get(idx); - FeedItem possibleDuplicate = searchFeedItemGuessDuplicate(newFeed.getItems(), item); + FeedItem possibleDuplicate = newFeedDuplicateGuesser.guessDuplicate(item); if (!newFeed.isLocalFeed() && possibleDuplicate != null && item != possibleDuplicate) { // Canonical episode is the first one returned (usually oldest) DBWriter.addDownloadStatus(new DownloadResult(item.getTitle(), @@ -142,9 +112,9 @@ public abstract class FeedDatabaseWriter { continue; } - FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed.getItems(), item); + FeedItem oldItem = savedFeedDuplicateGuesser.findById(item); if (!newFeed.isLocalFeed() && oldItem == null) { - oldItem = searchFeedItemGuessDuplicate(savedFeed.getItems(), item); + oldItem = savedFeedDuplicateGuesser.guessDuplicate(item); if (oldItem != null) { Log.d(TAG, "Repaired duplicate: " + oldItem + ", " + item); DBWriter.addDownloadStatus(new DownloadResult(item.getTitle(), @@ -181,6 +151,7 @@ public abstract class FeedDatabaseWriter { } else { savedFeed.getItems().add(idx, item); } + savedFeedDuplicateGuesser.add(item); boolean shouldPerformNewEpisodesAction = item.getPubDate() == null || priorMostRecentDate == null @@ -217,7 +188,7 @@ public abstract class FeedDatabaseWriter { Iterator it = savedFeed.getItems().iterator(); while (it.hasNext()) { FeedItem feedItem = it.next(); - if (searchFeedItemByIdentifyingValue(newFeed.getItems(), feedItem) == null) { + if (newFeedDuplicateGuesser.findById(feedItem) == null) { unlistedItems.add(feedItem); it.remove(); } diff --git a/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesser.java b/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesser.java index 19a2bd5f1..832993ccd 100644 --- a/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesser.java +++ b/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesser.java @@ -69,7 +69,7 @@ public class FeedItemDuplicateGuesser { return sameAndNotEmpty(canonicalizeTitle(item1.getTitle()), canonicalizeTitle(item2.getTitle())); } - private static String canonicalizeTitle(String title) { + public static String canonicalizeTitle(String title) { if (title == null) { return ""; } diff --git a/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPool.java b/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPool.java new file mode 100644 index 000000000..521baa3d6 --- /dev/null +++ b/storage/database/src/main/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPool.java @@ -0,0 +1,60 @@ +package de.danoeh.antennapod.storage.database; + +import de.danoeh.antennapod.model.feed.FeedItem; +import org.apache.commons.lang3.StringUtils; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class FeedItemDuplicateGuesserPool { + private final Map> normalizedTitles = new HashMap<>(); + private final Map downloadUrls = new HashMap<>(); + private final Map identifiers = new HashMap<>(); + + public FeedItemDuplicateGuesserPool(List itemsList) { + for (FeedItem item : itemsList) { + add(item); + } + } + + public void add(FeedItem item) { + String normalizedTitle = FeedItemDuplicateGuesser.canonicalizeTitle(item.getTitle()); + if (!normalizedTitles.containsKey(normalizedTitle)) { + normalizedTitles.put(normalizedTitle, new java.util.ArrayList<>()); + } + normalizedTitles.get(normalizedTitle).add(item); + if (item.getMedia() != null && !StringUtils.isEmpty(item.getMedia().getStreamUrl()) + && !downloadUrls.containsKey(item.getMedia().getStreamUrl())) { + downloadUrls.put(item.getMedia().getStreamUrl(), item); + } + if (item.getIdentifyingValue() != null && !identifiers.containsKey(item.getIdentifyingValue())) { + identifiers.put(item.getIdentifyingValue(), item); + } + } + + public FeedItem guessDuplicate(FeedItem searchItem) { + if (searchItem.getMedia() != null && !StringUtils.isEmpty(searchItem.getMedia().getStreamUrl()) + && downloadUrls.containsKey(searchItem.getMedia().getStreamUrl())) { + return downloadUrls.get(searchItem.getMedia().getStreamUrl()); + } + String normalizedTitle = FeedItemDuplicateGuesser.canonicalizeTitle(searchItem.getTitle()); + List candidates = normalizedTitles.get(normalizedTitle); + if (candidates == null) { + return null; + } + for (FeedItem item : candidates) { + if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) { + return item; + } + } + return null; + } + + public FeedItem findById(FeedItem item) { + if (identifiers.containsKey(item.getIdentifyingValue())) { + return identifiers.get(item.getIdentifyingValue()); + } + return null; + } +} diff --git a/net/download/service/src/test/java/de/danoeh/antennapod/net/download/service/episode/autodownload/DbTasksTest.java b/storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriterTest.java similarity index 61% rename from net/download/service/src/test/java/de/danoeh/antennapod/net/download/service/episode/autodownload/DbTasksTest.java rename to storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriterTest.java index c3298817a..8b582d1d1 100644 --- a/net/download/service/src/test/java/de/danoeh/antennapod/net/download/service/episode/autodownload/DbTasksTest.java +++ b/storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedDatabaseWriterTest.java @@ -1,33 +1,28 @@ -package de.danoeh.antennapod.net.download.service.episode.autodownload; +package de.danoeh.antennapod.storage.database; import android.content.Context; - -import androidx.test.platform.app.InstrumentationRegistry; - +import de.danoeh.antennapod.model.download.DownloadError; +import de.danoeh.antennapod.model.download.DownloadResult; +import de.danoeh.antennapod.model.feed.Feed; +import de.danoeh.antennapod.model.feed.FeedItem; +import de.danoeh.antennapod.model.feed.FeedItemFilter; +import de.danoeh.antennapod.model.feed.FeedMedia; +import de.danoeh.antennapod.model.feed.SortOrder; import de.danoeh.antennapod.net.sync.serviceinterface.SynchronizationQueue; import de.danoeh.antennapod.net.sync.serviceinterface.SynchronizationQueueStub; -import de.danoeh.antennapod.storage.database.DBReader; -import de.danoeh.antennapod.storage.database.DBWriter; -import de.danoeh.antennapod.storage.database.FeedDatabaseWriter; -import de.danoeh.antennapod.storage.database.PodDBAdapter; -import org.junit.After; +import de.danoeh.antennapod.storage.preferences.UserPreferences; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.robolectric.RobolectricTestRunner; +import org.robolectric.RuntimeEnvironment; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.List; +import java.util.concurrent.ExecutionException; -import de.danoeh.antennapod.model.feed.Feed; -import de.danoeh.antennapod.model.feed.FeedItem; -import de.danoeh.antennapod.model.feed.FeedMedia; -import de.danoeh.antennapod.storage.preferences.PlaybackPreferences; -import de.danoeh.antennapod.storage.preferences.UserPreferences; - -import static java.util.Collections.singletonList; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; @@ -35,40 +30,119 @@ import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; -/** - * Test class for {@link FeedDatabaseWriter}. - */ @RunWith(RobolectricTestRunner.class) -public class DbTasksTest { +public class FeedDatabaseWriterTest { private Context context; @Before public void setUp() { - context = InstrumentationRegistry.getInstrumentation().getTargetContext(); + context = RuntimeEnvironment.getApplication(); UserPreferences.init(context); - PlaybackPreferences.init(context); - SynchronizationQueue.setInstance(new SynchronizationQueueStub()); - - // create new database PodDBAdapter.init(context); PodDBAdapter.deleteDatabase(); PodDBAdapter adapter = PodDBAdapter.getInstance(); adapter.open(); adapter.close(); + SynchronizationQueue.setInstance(new SynchronizationQueueStub()); } - @After - public void tearDown() { - DBWriter.tearDownTests(); - PodDBAdapter.tearDownTests(); + @Test + public void testStoreNewFeed() { + Feed feed = createFeed(); + for (int i = 0; i < 3; i++) { + feed.getItems().add(createItem("item-" + i, "Item " + i, feed)); + } + Feed updatedFeed = FeedDatabaseWriter.updateFeed(context, feed, false); + List storedItems = DBReader.getFeedItemList(updatedFeed, FeedItemFilter.unfiltered(), + SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE); + assertEquals(3, storedItems.size()); + for (int i = 0; i < 3; i++) { + assertEquals("item-" + i, storedItems.get(i).getItemIdentifier()); + } + } + + @Test + public void testAddItemsToExistingFeed() { + Feed feed = createFeed(); + for (int i = 0; i < 3; i++) { + feed.getItems().add(createItem("item-" + i, "Item " + i, feed)); + } + feed = FeedDatabaseWriter.updateFeed(context, feed, false); + + Feed updatedFeed = createFeed(); + updatedFeed.setId(feed.getId()); + for (int i = 3; i < 6; i++) { + updatedFeed.getItems().add(createItem("item-" + i, "Item " + i, feed)); + } + FeedDatabaseWriter.updateFeed(context, updatedFeed, false); + + List dbItems = DBReader.getFeedItemList(feed, FeedItemFilter.unfiltered(), + SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE); + assertEquals(6, dbItems.size()); + for (int i = 0; i < 6; i++) { + assertEquals("item-" + i, dbItems.get(i).getItemIdentifier()); + } + } + + @Test + public void testAddOrUpdateItems() throws ExecutionException, InterruptedException { + Feed feed = createFeed(); + for (int i = 0; i < 3; i++) { + feed.getItems().add(createItem("item-" + i, "Item " + i, feed)); + } + feed = FeedDatabaseWriter.updateFeed(context, feed, false); + DBReader.getFeedItemList(feed, FeedItemFilter.unfiltered(), + SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE); + DBWriter.markItemPlayed(feed.getItems().get(2), FeedItem.PLAYED, false).get(); + + Feed updatedFeed = createFeed(); + updatedFeed.setId(feed.getId()); + for (int i = 2; i < 5; i++) { + updatedFeed.getItems().add(createItem("item-" + i, "Item " + i, feed)); + } + FeedDatabaseWriter.updateFeed(context, updatedFeed, false); + + List dbItems = DBReader.getFeedItemList(feed, FeedItemFilter.unfiltered(), + SortOrder.EPISODE_TITLE_A_Z, 0, Integer.MAX_VALUE); + assertEquals(5, dbItems.size()); + for (int i = 0; i < 5; i++) { + assertEquals("item-" + i, dbItems.get(i).getItemIdentifier()); + } + assertEquals(FeedItem.PLAYED, dbItems.get(2).getPlayState()); + } + + @Test + public void testDuplicateItemsInFeed() { + Feed feed = createFeed(); + feed.getItems().add(createItem("id1", "Duplicate Title", feed)); + feed.getItems().add(createItem("id2", "Duplicate Title", feed)); + FeedDatabaseWriter.updateFeed(context, feed, false); // First update just takes the feed without complaining + FeedDatabaseWriter.updateFeed(context, feed, false); + + List downloadLog = DBReader.getDownloadLog(); + assertEquals(1, downloadLog.size()); + assertEquals(DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, downloadLog.get(0).getReason()); + } + + @Test + public void testGuidUpdated() { + Feed feed = createFeed(); + feed.getItems().add(createItem("old-id", "Unique Title", feed)); + FeedDatabaseWriter.updateFeed(context, feed, false); + + Feed newFeed = createFeed(); + newFeed.getItems().add(createItem("new-id", "Unique Title", newFeed)); + Feed stored = FeedDatabaseWriter.updateFeed(context, newFeed, false); + + assertEquals(1, stored.getItems().size()); + assertEquals("new-id", stored.getItems().get(0).getItemIdentifier()); } @Test public void testUpdateFeedNewFeed() { final int numItems = 10; - Feed feed = new Feed("url", null, "title"); - feed.setItems(new ArrayList<>()); + Feed feed = createFeed(); for (int i = 0; i < numItems; i++) { feed.getItems().add(new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(), FeedItem.UNPLAYED, feed)); @@ -86,12 +160,9 @@ public class DbTasksTest { /** Two feeds with the same title, but different download URLs should be treated as different feeds. */ @Test public void testUpdateFeedSameTitle() { - - Feed feed1 = new Feed("url1", null, "title"); - Feed feed2 = new Feed("url2", null, "title"); - - feed1.setItems(new ArrayList<>()); - feed2.setItems(new ArrayList<>()); + Feed feed1 = createFeed(); + Feed feed2 = createFeed(); + feed2.setDownloadUrl("different url"); Feed savedFeed1 = FeedDatabaseWriter.updateFeed(context, feed1, false); Feed savedFeed2 = FeedDatabaseWriter.updateFeed(context, feed2, false); @@ -104,8 +175,7 @@ public class DbTasksTest { final int numItemsOld = 10; final int numItemsNew = 10; - final Feed feed = new Feed("url", null, "title"); - feed.setItems(new ArrayList<>()); + final Feed feed = createFeed(); for (int i = 0; i < numItemsOld; i++) { feed.getItems().add(new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed)); @@ -144,9 +214,9 @@ public class DbTasksTest { @Test public void testUpdateFeedMediaUrlResetState() { - final Feed feed = new Feed("url", null, "title"); + final Feed feed = createFeed(); FeedItem item = new FeedItem(0, "item", "id", "link", new Date(), FeedItem.PLAYED, feed); - feed.setItems(singletonList(item)); + feed.setItems(Collections.singletonList(item)); PodDBAdapter adapter = PodDBAdapter.getInstance(); adapter.open(); @@ -173,8 +243,7 @@ public class DbTasksTest { @Test public void testUpdateFeedRemoveUnlistedItems() { - final Feed feed = new Feed("url", null, "title"); - feed.setItems(new ArrayList<>()); + final Feed feed = createFeed(); for (int i = 0; i < 10; i++) { feed.getItems().add( new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed)); @@ -195,8 +264,7 @@ public class DbTasksTest { @Test public void testUpdateFeedSetDuplicate() { - final Feed feed = new Feed("url", null, "title"); - feed.setItems(new ArrayList<>()); + final Feed feed = createFeed(); for (int i = 0; i < 10; i++) { FeedItem item = new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed); @@ -249,4 +317,19 @@ public class DbTasksTest { lastDate = item.getPubDate(); } } + + private Feed createFeed() { + Feed feed = new Feed("url", null, null); + feed.setItems(new ArrayList<>()); + return feed; + } + + private FeedItem createItem(String identifier, String title, Feed feed) { + FeedItem item = new FeedItem(); + item.setItemIdentifier(identifier); + item.setTitle(title); + item.setMedia(new FeedMedia(item, "url-" + title, 2, "mime")); + item.setFeed(feed); + return item; + } } diff --git a/storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPoolTest.java b/storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPoolTest.java new file mode 100644 index 000000000..bdb0bf717 --- /dev/null +++ b/storage/database/src/test/java/de/danoeh/antennapod/storage/database/FeedItemDuplicateGuesserPoolTest.java @@ -0,0 +1,40 @@ +package de.danoeh.antennapod.storage.database; + +import de.danoeh.antennapod.model.feed.Feed; +import de.danoeh.antennapod.model.feed.FeedItem; +import de.danoeh.antennapod.model.feed.FeedMedia; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +import java.util.ArrayList; + +import static org.junit.Assert.assertSame; + +@RunWith(JUnit4.class) +public class FeedItemDuplicateGuesserPoolTest { + + @Test + public void testDuplicateIsConsistent() { + Feed feed = new Feed("url", null, null); + FeedItem item1 = createItem("id1", "Title", feed); + FeedItem item2 = createItem("id2", "Title", feed); + + FeedItemDuplicateGuesserPool pool = new FeedItemDuplicateGuesserPool(new ArrayList<>()); + pool.add(item1); + assertSame(item1, pool.guessDuplicate(item1)); + assertSame(item1, pool.guessDuplicate(item2)); + pool.add(item2); + assertSame(item1, pool.guessDuplicate(item1)); + assertSame(item1, pool.guessDuplicate(item2)); + } + + private FeedItem createItem(String identifier, String title, Feed feed) { + FeedItem item = new FeedItem(); + item.setItemIdentifier(identifier); + item.setTitle(title); + item.setMedia(new FeedMedia(item, "url-" + title, 2, "mime")); + item.setFeed(feed); + return item; + } +}