diff --git a/resources/text/CHANGELOG b/resources/text/CHANGELOG index 9af05e50a..1badda172 100755 --- a/resources/text/CHANGELOG +++ b/resources/text/CHANGELOG @@ -5,6 +5,7 @@ Added: ▪ Auto-update status of feeds is now more general and complete. ( Changed: +▪ Better ATOM parsing, respects now XML namespaces, bit better link parsing and other stuff. (issue #104) ▪ Folder which holds SQL scripts got renamed to "sql". ▪ Tweaked some conditions for determining newly "updated" messages in ATOM format. (issue #103) diff --git a/rssguard.pro b/rssguard.pro index 8110e4b79..802513612 100755 --- a/rssguard.pro +++ b/rssguard.pro @@ -319,9 +319,10 @@ HEADERS += src/core/feeddownloader.h \ src/gui/settings/settingsfeedsmessages.h \ src/gui/settings/settingsdownloads.h \ src/miscellaneous/feedreader.h \ - src/core/atomparser.h \ - src/core/rssparser.h \ - src/core/rdfparser.h + src/services/standard/atomparser.h \ + src/services/standard/feedparser.h \ + src/services/standard/rdfparser.h \ + src/services/standard/rssparser.h SOURCES += src/core/feeddownloader.cpp \ src/core/feedsmodel.cpp \ @@ -436,9 +437,10 @@ SOURCES += src/core/feeddownloader.cpp \ src/gui/settings/settingsfeedsmessages.cpp \ src/gui/settings/settingsdownloads.cpp \ src/miscellaneous/feedreader.cpp \ - src/core/atomparser.cpp \ - src/core/rssparser.cpp \ - src/core/rdfparser.cpp + src/services/standard/atomparser.cpp \ + src/services/standard/feedparser.cpp \ + src/services/standard/rdfparser.cpp \ + src/services/standard/rssparser.cpp FORMS += src/gui/toolbareditor.ui \ src/network-web/downloaditem.ui \ diff --git a/src/core/atomparser.cpp b/src/core/atomparser.cpp deleted file mode 100644 index 468dfc229..000000000 --- a/src/core/atomparser.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// This file is part of RSS Guard. 
-// -// Copyright (C) 2011-2017 by Martin Rotter -// -// RSS Guard is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RSS Guard is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RSS Guard. If not, see . - -#include "core/atomparser.h" - -#include "miscellaneous/textfactory.h" -#include "network-web/webfactory.h" - -#include - - -AtomParser::AtomParser() { -} - -AtomParser::~AtomParser() { -} - -QList AtomParser::parseXmlData(const QString &data) { - QList messages; - QDomDocument xml_file; - QDateTime current_time = QDateTime::currentDateTime(); - const QString atom_ns = QSL("http://www.w3.org/2005/Atom"); - - xml_file.setContent(data, true); - - // Pull out all messages. - QDomNodeList messages_in_xml = xml_file.elementsByTagName(QSL("entry")); - - for (int i = 0; i < messages_in_xml.size(); i++) { - QDomNode message_item = messages_in_xml.item(i); - Message new_message; - - // Deal with titles & descriptions. - QString elem_title = message_item.namedItem(QSL("title")).toElement().text().simplified(); - QString elem_summary = message_item.namedItem(QSL("summary")).toElement().text(); - - if (elem_summary.isEmpty()) { - elem_summary = message_item.namedItem(QSL("content")).toElement().text(); - } - - // Now we obtained maximum of information for title & description. - if (elem_title.isEmpty()) { - if (elem_summary.isEmpty()) { - // BOTH title and description are empty, skip this message. - continue; - } - else { - // Title is empty but description is not. 
- new_message.m_title = WebFactory::instance()->stripTags(elem_summary.simplified()); - new_message.m_contents = elem_summary; - } - } - else { - // Title is not empty, description does not matter. - new_message.m_title = WebFactory::instance()->stripTags(elem_title); - new_message.m_contents = elem_summary; - } - - // Deal with link. - QDomNodeList elem_links = message_item.toElement().elementsByTagName(QSL("link")); - - for (int i = 0; i < elem_links.size(); i++) { - QDomElement link = elem_links.at(i).toElement(); - - if (link.attribute(QSL("rel")) == QSL("enclosure")) { - new_message.m_enclosures.append(Enclosure(link.attribute(QSL("href")), link.attribute(QSL("type")))); - - qDebug("Adding enclosure '%s' for the message.", qPrintable(new_message.m_enclosures.last().m_url)); - } - else { - new_message.m_url = link.attribute(QSL("href")); - } - } - - if (new_message.m_url.isEmpty() && !new_message.m_enclosures.isEmpty()) { - new_message.m_url = new_message.m_enclosures.first().m_url; - } - - // Deal with authors. - new_message.m_author = WebFactory::instance()->escapeHtml(message_item.namedItem(QSL("author")).namedItem(QSL("name")).toElement().text()); - - // Deal with creation date. - new_message.m_created = TextFactory::parseDateTime(message_item.namedItem(QSL("updated")).toElement().text()); - new_message.m_createdFromFeed = !new_message.m_created.isNull(); - - if (!new_message.m_createdFromFeed) { - // Date was NOT obtained from the feed, set current date as creation date for the message. - new_message.m_created = current_time; - } - - // WARNING: There is a difference between "" and QString() in terms of nullptr SQL values! - // This is because of difference in QString::isNull() and QString::isEmpty(), the "" is not null - // while QString() is. 
-    if (new_message.m_author.isNull()) {
-      new_message.m_author = "";
-    }
-
-    if (new_message.m_url.isNull()) {
-      new_message.m_url = "";
-    }
-
-    messages.append(new_message);
-  }
-
-  return messages;
-}
diff --git a/src/services/standard/atomparser.cpp b/src/services/standard/atomparser.cpp
new file mode 100644
index 000000000..ddb84bd70
--- /dev/null
+++ b/src/services/standard/atomparser.cpp
@@ -0,0 +1,163 @@
+// This file is part of RSS Guard.
+//
+// Copyright (C) 2011-2017 by Martin Rotter
+//
+// RSS Guard is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RSS Guard is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
+
+#include "services/standard/atomparser.h"
+
+#include "miscellaneous/textfactory.h"
+#include "network-web/webfactory.h"
+
+#include "exceptions/applicationexception.h"
+
+
+namespace {
+
+// Joins the text of the first "name" element found inside each author element of "authors".
+// If "required_parent" is not null, then only authors which are direct children of that node
+// are used, which keeps authors of nested elements out of the result.
+QString joinAuthorNames(const QDomNodeList &authors, const QString &atom_namespace,
+                        const QDomNode &required_parent = QDomNode()) {
+  QStringList author_names;
+
+  for (int i = 0; i < authors.size(); i++) {
+    const QDomNode author = authors.at(i);
+
+    if (!required_parent.isNull() && author.parentNode() != required_parent) {
+      continue;
+    }
+
+    const QDomNodeList names = author.toElement().elementsByTagNameNS(atom_namespace, QSL("name"));
+
+    if (!names.isEmpty()) {
+      author_names.append(names.at(0).toElement().text());
+    }
+  }
+
+  return author_names.join(QSL(", "));
+}
+
+}
+
+// Parser of ATOM feeds, "data" is parsed into XML document by the FeedParser constructor.
+AtomParser::AtomParser(const QString &data) : FeedParser(data), m_atomNamespace(QSL("http://www.w3.org/2005/Atom")) {
+}
+
+AtomParser::~AtomParser() {
+}
+
+// Returns names of authors declared directly on the root element, joined with ", ".
+QString AtomParser::feedAuthor() const {
+  const QDomElement root = m_xml.documentElement();
+
+  // elementsByTagNameNS() is called on the root element and therefore also yields authors
+  // nested in entries, those are filtered out via the parent check.
+  return joinAuthorNames(root.elementsByTagNameNS(m_atomNamespace, QSL("author")), m_atomNamespace, root);
+}
+
+// Builds message from single entry element.
+// Throws ApplicationException (by value) if entry contains neither title nor contents.
+Message AtomParser::extractMessage(const QDomElement &msg_element, QDateTime current_time) const {
+  Message new_message;
+  QString title = textsFromPath(msg_element, m_atomNamespace, QSL("title"), true).join(QSL(", "));
+  QString summary = textsFromPath(msg_element, m_atomNamespace, QSL("content"), true).join(QSL(", "));
+
+  if (summary.isEmpty()) {
+    summary = textsFromPath(msg_element, m_atomNamespace, QSL("summary"), true).join(QSL(", "));
+  }
+
+  // Now we obtained maximum of information for title & description.
+  if (title.isEmpty() && summary.isEmpty()) {
+    // BOTH title and description are empty, skip this message.
+    // NOTE: Exception is thrown by value, "throw new" would not be matched
+    // by "catch (const ApplicationException &)" in FeedParser::messages().
+    throw ApplicationException(QSL("Not enough data for the message."));
+  }
+
+  if (title.isEmpty()) {
+    // Title is empty but description is not, use description as the title.
+    title = summary.simplified();
+  }
+
+  new_message.m_title = WebFactory::instance()->stripTags(title);
+  new_message.m_contents = summary;
+  new_message.m_author = WebFactory::instance()->escapeHtml(messageAuthor(msg_element));
+
+  QString updated = textsFromPath(msg_element, m_atomNamespace, QSL("updated"), true).join(QSL(", "));
+
+  // Deal with creation date.
+  new_message.m_created = TextFactory::parseDateTime(updated);
+  new_message.m_createdFromFeed = !new_message.m_created.isNull();
+
+  if (!new_message.m_createdFromFeed) {
+    // Date was NOT obtained from the feed, set current date as creation date for the message.
+    new_message.m_created = current_time;
+  }
+
+  // Deal with links, "alternate" (or missing "rel") wins over other kinds of links,
+  // the first enclosure is the last resort.
+  QDomNodeList elem_links = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("link"));
+  QString last_link_alternate, last_link_other;
+
+  for (int i = 0; i < elem_links.size(); i++) {
+    QDomElement link = elem_links.at(i).toElement();
+    QString attribute = link.attribute(QSL("rel"));
+
+    if (attribute == QSL("enclosure")) {
+      new_message.m_enclosures.append(Enclosure(link.attribute(QSL("href")), link.attribute(QSL("type"))));
+
+      qDebug("Adding enclosure '%s' for the message.", qPrintable(new_message.m_enclosures.last().m_url));
+    }
+    else if (attribute.isEmpty() || attribute == QSL("alternate")) {
+      last_link_alternate = link.attribute(QSL("href"));
+    }
+    else {
+      last_link_other = link.attribute(QSL("href"));
+    }
+  }
+
+  if (!last_link_alternate.isEmpty()) {
+    new_message.m_url = last_link_alternate;
+  }
+  else if (!last_link_other.isEmpty()) {
+    new_message.m_url = last_link_other;
+  }
+  else if (!new_message.m_enclosures.isEmpty()) {
+    new_message.m_url = new_message.m_enclosures.first().m_url;
+  }
+
+  // WARNING: There is a difference between "" and QString() in terms of nullptr SQL values!
+  // This is because of difference in QString::isNull() and QString::isEmpty(), the "" is not null
+  // while QString() is.
+  if (new_message.m_author.isNull()) {
+    new_message.m_author = "";
+  }
+
+  if (new_message.m_url.isNull()) {
+    new_message.m_url = "";
+  }
+
+  return new_message;
+}
+
+// Returns names of all authors found within the entry, joined with ", ".
+QString AtomParser::messageAuthor(const QDomElement &msg_element) const {
+  return joinAuthorNames(msg_element.elementsByTagNameNS(m_atomNamespace, QSL("author")), m_atomNamespace);
+}
+
+// Returns all entry elements of the document.
+QDomNodeList AtomParser::messageElements() {
+  return m_xml.elementsByTagNameNS(m_atomNamespace, QSL("entry"));
+}
diff --git 
a/src/core/atomparser.h b/src/services/standard/atomparser.h
similarity index 65%
rename from src/core/atomparser.h
rename to src/services/standard/atomparser.h
index 91d7896c7..82e6f22f0 100644
--- a/src/core/atomparser.h
+++ b/src/services/standard/atomparser.h
@@ -18,17 +18,29 @@
 #ifndef ATOMPARSER_H
 #define ATOMPARSER_H
 
+#include "services/standard/feedparser.h"
+
 #include "core/message.h"
 
 #include <QList>
+#include <QDomDocument>
 
 
-class AtomParser {
+// Parser of feeds in ATOM format, elements are looked up in namespace m_atomNamespace.
+class AtomParser : public FeedParser {
   public:
-    explicit AtomParser();
+    explicit AtomParser(const QString &data);
     virtual ~AtomParser();
 
-    QList<Message> parseXmlData(const QString &data);
+  private:
+    QDomNodeList messageElements();
+    QString feedAuthor() const;
+    Message extractMessage(const QDomElement &msg_element, QDateTime current_time) const;
+    QString messageAuthor(const QDomElement &msg_element) const;
+
+  private:
+    // Namespace URI passed to all namespace-aware element lookups.
+    QString m_atomNamespace;
 };
 
 #endif // ATOMPARSER_H
diff --git a/src/services/standard/feedparser.cpp b/src/services/standard/feedparser.cpp
new file mode 100644
index 000000000..cf0d0ea8d
--- /dev/null
+++ b/src/services/standard/feedparser.cpp
@@ -0,0 +1,108 @@
+// This file is part of RSS Guard.
+//
+// Copyright (C) 2011-2017 by Martin Rotter
+//
+// RSS Guard is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RSS Guard is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
+
+#include "services/standard/feedparser.h"
+
+#include "exceptions/applicationexception.h"
+
+
+// Stores "data" and parses it into XML document with namespace processing enabled.
+FeedParser::FeedParser(const QString &data) : m_xmlData(data) {
+  m_xml.setContent(m_xmlData, true);
+}
+
+FeedParser::~FeedParser() {
+}
+
+// Extracts messages from all elements returned by messageElements(). Elements for which
+// extractMessage() throws ApplicationException are skipped. Messages with empty author
+// get the author of the whole feed.
+QList<Message> FeedParser::messages() {
+  QString feed_author = feedAuthor();
+
+  QList<Message> messages;
+  QDateTime current_time = QDateTime::currentDateTime();
+
+  // Pull out all messages.
+  QDomNodeList messages_in_xml = messageElements();
+
+  for (int i = 0; i < messages_in_xml.size(); i++) {
+    QDomNode message_item = messages_in_xml.item(i);
+
+    try {
+      Message new_message = extractMessage(message_item.toElement(), current_time);
+
+      if (new_message.m_author.isEmpty()) {
+        new_message.m_author = feed_author;
+      }
+
+      // Keep the author non-null, null QString and "" are not the same in terms of SQL values.
+      if (new_message.m_author.isNull()) {
+        new_message.m_author = "";
+      }
+
+      messages.append(new_message);
+    }
+    catch (const ApplicationException &ex) {
+      // Only exceptions thrown by value are matched by this handler.
+      // Message is passed as an argument, it must never be used as the format string.
+      qDebug("%s", qPrintable(ex.message()));
+    }
+  }
+
+  return messages;
+}
+
+// Walks "xml_path" ('/'-separated local names) downwards from "element" and returns texts of
+// the elements matched by the last path segment. With "only_first" set, at most one element
+// is followed at each level.
+QStringList FeedParser::textsFromPath(const QDomElement &element, const QString &namespace_uri,
+                                      const QString &xml_path, bool only_first) const {
+  QStringList paths = xml_path.split('/');
+  QStringList result;
+  QList<QDomElement> current_elements;
+  current_elements.append(element);
+
+  while (!paths.isEmpty()) {
+    QList<QDomElement> next_elements;
+    QString next_local_name = paths.takeFirst();
+
+    foreach (QDomElement elem, current_elements) {
+      QDomNodeList elements = elem.elementsByTagNameNS(namespace_uri, next_local_name);
+
+      for (int i = 0; i < elements.size(); i++) {
+        next_elements.append(elements.at(i).toElement());
+
+        if (only_first) {
+          break;
+        }
+      }
+
+      if (next_elements.size() == 1 && only_first) {
+        break;
+      }
+    }
+
+    current_elements = next_elements;
+  }
+
+  foreach (QDomElement elem, current_elements) {
+    result.append(elem.text());
+  }
+
+  return result;
+}
diff --git a/src/services/standard/feedparser.h b/src/services/standard/feedparser.h
new file mode 100644
index 000000000..0829a3106
--- /dev/null
+++ b/src/services/standard/feedparser.h
@@ 
-0,0 +1,58 @@
+// This file is part of RSS Guard.
+//
+// Copyright (C) 2011-2017 by Martin Rotter
+//
+// RSS Guard is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// RSS Guard is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef FEEDPARSER_H
+#define FEEDPARSER_H
+
+#include <QDomDocument>
+#include <QString>
+
+#include "core/message.h"
+
+
+// Base class of feed parsers. It holds the parsed XML document and drives extraction of messages,
+// while format-specific parts are supplied by subclasses via the pure virtual methods.
+class FeedParser {
+  public:
+    // Stores "data" and parses it into m_xml.
+    explicit FeedParser(const QString &data);
+    virtual ~FeedParser();
+
+    // Returns messages built from elements returned by messageElements().
+    virtual QList<Message> messages();
+
+  protected:
+    // Follows '/'-separated "xml_path" of local names (within "namespace_uri") from "element"
+    // and returns texts of the matched elements.
+    QStringList textsFromPath(const QDomElement &element, const QString &namespace_uri, const QString &xml_path, bool only_first) const;
+
+    // Returns elements which represent individual messages.
+    virtual QDomNodeList messageElements() = 0;
+
+    // Returns author of the whole feed, it is used for messages with empty author.
+    virtual QString feedAuthor() const = 0;
+
+    // Builds message from "msg_element". ApplicationException thrown from here
+    // makes messages() skip the element.
+    virtual Message extractMessage(const QDomElement &msg_element, QDateTime current_time) const = 0;
+
+  protected:
+    QString m_xmlData;
+    QDomDocument m_xml;
+};
+
+#endif // FEEDPARSER_H
diff --git a/src/core/rdfparser.cpp b/src/services/standard/rdfparser.cpp
similarity index 95%
rename from src/core/rdfparser.cpp
rename to src/services/standard/rdfparser.cpp
index 821eb3773..bce9ad952 100644
--- a/src/core/rdfparser.cpp
+++ b/src/services/standard/rdfparser.cpp
@@ -15,7 +15,7 @@
 // You should have received a copy of the GNU General Public License
 // along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
 
-#include "core/rdfparser.h"
+#include "services/standard/rdfparser.h"
 
 #include "miscellaneous/textfactory.h"
 #include "network-web/webfactory.h"
diff --git a/src/core/rdfparser.h b/src/services/standard/rdfparser.h
similarity index 100%
rename from src/core/rdfparser.h
rename to src/services/standard/rdfparser.h
diff --git a/src/core/rssparser.cpp b/src/services/standard/rssparser.cpp
similarity index 96%
rename from src/core/rssparser.cpp
rename to src/services/standard/rssparser.cpp
index 8da0af9ea..bdfcd328e 100644
--- a/src/core/rssparser.cpp
+++ b/src/services/standard/rssparser.cpp
@@ -15,7 +15,7 @@
 // You should have received a copy of the GNU General Public License
 // along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
 
-#include "core/rssparser.h"
+#include "services/standard/rssparser.h"
 
 #include "miscellaneous/textfactory.h"
 #include "network-web/webfactory.h"
diff --git a/src/core/rssparser.h b/src/services/standard/rssparser.h
similarity index 100%
rename from src/core/rssparser.h
rename to src/services/standard/rssparser.h
diff --git a/src/services/standard/standardfeed.cpp b/src/services/standard/standardfeed.cpp
index 2f05735dd..4e1de2bcb 100755
--- a/src/services/standard/standardfeed.cpp
+++ b/src/services/standard/standardfeed.cpp
@@ -18,9 +18,9 @@
 #include "services/standard/standardfeed.h"
 
 #include "definitions/definitions.h"
-#include "core/rssparser.h"
-#include "core/rdfparser.h"
-#include "core/atomparser.h"
+#include "services/standard/rssparser.h"
+#include "services/standard/rdfparser.h"
+#include "services/standard/atomparser.h"
 #include "core/feedsmodel.h"
 #include "miscellaneous/databasequeries.h"
 #include "miscellaneous/textfactory.h"
@@ -454,7 +454,8 @@ QList<Message> StandardFeed::obtainNewMessages(bool *error_during_obtaining) {
       break;
 
     case StandardFeed::Atom10:
-      messages = AtomParser().parseXmlData(formatted_feed_contents);
+      messages = AtomParser(formatted_feed_contents).messages();
+      break;
 
     default:
       break;