From 7b19910e0b05165db672a98b92a9e319e6268efb Mon Sep 17 00:00:00 2001
From: igrekster
Date: Tue, 22 Feb 2022 18:55:30 +1100
Subject: [PATCH] Remove duplicates from downloaded message list (#650)

Some downloaded feeds contain multiple entries for the same message. See
`http://feeds.feedburner.com/abseilio` for an example that has:

```
2020-09-11T00:00:00-04:00
https://abseil.io/tips/5
--
2020-06-01T00:00:00-04:00
https://abseil.io/tips/5
```

When the database is updated for the first time, both entries end up stored.
On subsequent feed updates one entry matches the database, while the second
has a different creation date and is therefore always marked as unread.
---
 src/librssguard/core/feeddownloader.cpp | 55 +++++++++++++++++++++++++
 src/librssguard/core/feeddownloader.h   |  1 +
 2 files changed, 56 insertions(+)
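For readers skimming the diff below: the new helper boils down to keeping, within each group of duplicate entries, the one with the latest creation date (on a tie, the entry that appears later in the list wins). Here is a minimal standalone sketch of that idea; the simplified `Entry` struct and the name `keepLatestPerId` are illustrative only, while the real helper works in place on `QList<Message>` and can also match duplicates by database ID or by title/URL/author when no custom ID is available.

```
// Simplified sketch, not the code from this patch: keep, for each custom ID,
// only the entry with the latest creation date.
#include <QDateTime>
#include <QList>
#include <QString>

struct Entry {
  QString m_customId;  // e.g. "https://abseil.io/tips/5"
  QDateTime m_created; // e.g. 2020-06-01 vs 2020-09-11 in the feed above
};

QList<Entry> keepLatestPerId(const QList<Entry>& entries) {
  QList<Entry> result;

  for (const Entry& candidate : entries) {
    bool merged = false;

    for (Entry& kept : result) {
      if (kept.m_customId == candidate.m_customId) {
        // Duplicate identity: keep whichever entry was created later;
        // on identical dates the later entry in the list wins.
        if (kept.m_created <= candidate.m_created) {
          kept = candidate;
        }
        merged = true;
        break;
      }
    }

    if (!merged) {
      result.append(candidate);
    }
  }

  return result;
}
```

The patch itself applies the same idea in place on the downloaded `msgs` list, right before the messages are saved to SQL.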
diff --git a/src/librssguard/core/feeddownloader.cpp b/src/librssguard/core/feeddownloader.cpp
index a79e8d168..d8363c0f0 100644
--- a/src/librssguard/core/feeddownloader.cpp
+++ b/src/librssguard/core/feeddownloader.cpp
@@ -348,6 +348,8 @@ void FeedDownloader::updateOneFeed(ServiceRoot* acc,
     }
   }
 
+  removeDuplicateMessages(msgs);
+
   // Now make sure, that messages are actually stored to SQL in a locked state.
   qDebugNN << LOGSEC_FEEDDOWNLOADER << "Saving messages of feed ID '" << feed->customId() << "' URL: '"
            << feed->source() << "' title: '" << feed->title() << "' in thread: '"
@@ -418,6 +420,59 @@ bool FeedDownloader::isCacheSynchronizationRunning() const {
   return m_isCacheSynchronizationRunning;
 }
 
+void FeedDownloader::removeDuplicateMessages(QList<Message>& messages) {
+  auto idx = 0;
+  while (idx < messages.size()) {
+    Message& message = messages[idx];
+    std::function<bool(const Message&, const Message&)> is_duplicate;
+    if (message.m_id > 0) {
+      is_duplicate = [](const Message& a, const Message& b) {
+        return a.m_id == b.m_id;
+      };
+    }
+    else if (message.m_customId.isEmpty()) {
+      is_duplicate = [](const Message& a, const Message& b) {
+        return std::tie(a.m_title, a.m_url, a.m_author) == std::tie(b.m_title, b.m_url, b.m_author);
+      };
+    }
+    else {
+      is_duplicate = [](const Message& a, const Message& b) {
+        return a.m_customId == b.m_customId;
+      };
+    }
+
+    auto next_idx = idx + 1; // Index of next message to check after removing all duplicates.
+    auto last_idx = idx;     // Index of the last kept duplicate.
+    idx = next_idx;
+    // Remove all duplicate messages, and keep the message with the latest created date.
+    // If the created date is identical for all duplicate messages then keep the last message in the list.
+    while (idx < messages.size()) {
+      auto& last_duplicate = messages[last_idx];
+      if (is_duplicate(last_duplicate, messages[idx])) {
+        if (last_duplicate.m_created <= messages[idx].m_created) {
+          // The last seen message was created earlier or at the same date -- keep the current, and remove the last.
+          messages.removeAt(last_idx);
+          if (last_idx + 1 == next_idx) {
+            // The `next_idx` was pointing to the message following the duplicate. With that duplicate removed the
+            // next index needs to be adjusted.
+            next_idx = last_idx;
+          }
+          last_idx = idx;
+          ++idx;
+        }
+        else {
+          messages.removeAt(idx);
+        }
+      }
+      else {
+        ++idx;
+      }
+    }
+
+    idx = next_idx;
+  }
+}
+
 QString FeedDownloadResults::overview(int how_many_feeds) const {
   QStringList result;
 
diff --git a/src/librssguard/core/feeddownloader.h b/src/librssguard/core/feeddownloader.h
index 7687c14a3..7bed58667 100644
--- a/src/librssguard/core/feeddownloader.h
+++ b/src/librssguard/core/feeddownloader.h
@@ -59,6 +59,7 @@ class FeedDownloader : public QObject {
                        const QHash<ServiceRoot::BagOfMessages, QStringList>& stated_messages,
                        const QHash<QString, QStringList>& tagged_messages);
     void finalizeUpdate();
+    static void removeDuplicateMessages(QList<Message>& messages);
 
     bool m_isCacheSynchronizationRunning;
     bool m_stopCacheSynchronization;
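The new helper is `static` and only touches the list it is given, so it can be exercised without a `FeedDownloader` instance or a live download. A hedged sketch of such a check, assuming the helper is reachable from test code and that the relevant RSS Guard headers are included; the test name and the explicit `m_id = 0` initialisation are illustrative, not taken from this patch:

```
// Hypothetical regression check (not part of this patch).
#include <QDateTime>

void testDuplicateRemovalKeepsLatest() {
  QList<Message> msgs;

  Message older;
  older.m_id = 0; // No database id yet, so duplicates are matched by custom ID.
  older.m_customId = QStringLiteral("https://abseil.io/tips/5");
  older.m_created = QDateTime(QDate(2020, 6, 1), QTime(0, 0));

  Message newer;
  newer.m_id = 0;
  newer.m_customId = QStringLiteral("https://abseil.io/tips/5");
  newer.m_created = QDateTime(QDate(2020, 9, 11), QTime(0, 0));

  msgs << older << newer;
  FeedDownloader::removeDuplicateMessages(msgs);

  // Only the entry with the later creation date should survive.
  Q_ASSERT(msgs.size() == 1);
  Q_ASSERT(msgs.first().m_created == newer.m_created);
}
```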