Better ATOM parsing.
This commit is contained in:
parent
410007defb
commit
95a5831424
12 changed files with 298 additions and 135 deletions
|
@ -5,6 +5,7 @@ Added:
|
|||
▪ Auto-update status of feeds is now more general and complete. (
|
||||
|
||||
Changed:
|
||||
▪ Better ATOM parsing: XML namespaces are now respected, link parsing is slightly better, and other small improvements. (issue #104)
|
||||
▪ Folder which holds SQL scripts got renamed to "sql".
|
||||
▪ Tweaked some conditions for determining newly "updated" messages in ATOM format. (issue #103)
|
||||
|
||||
|
|
14
rssguard.pro
14
rssguard.pro
|
@ -319,9 +319,10 @@ HEADERS += src/core/feeddownloader.h \
|
|||
src/gui/settings/settingsfeedsmessages.h \
|
||||
src/gui/settings/settingsdownloads.h \
|
||||
src/miscellaneous/feedreader.h \
|
||||
src/core/atomparser.h \
|
||||
src/core/rssparser.h \
|
||||
src/core/rdfparser.h
|
||||
src/services/standard/atomparser.h \
|
||||
src/services/standard/feedparser.h \
|
||||
src/services/standard/rdfparser.h \
|
||||
src/services/standard/rssparser.h
|
||||
|
||||
SOURCES += src/core/feeddownloader.cpp \
|
||||
src/core/feedsmodel.cpp \
|
||||
|
@ -436,9 +437,10 @@ SOURCES += src/core/feeddownloader.cpp \
|
|||
src/gui/settings/settingsfeedsmessages.cpp \
|
||||
src/gui/settings/settingsdownloads.cpp \
|
||||
src/miscellaneous/feedreader.cpp \
|
||||
src/core/atomparser.cpp \
|
||||
src/core/rssparser.cpp \
|
||||
src/core/rdfparser.cpp
|
||||
src/services/standard/atomparser.cpp \
|
||||
src/services/standard/feedparser.cpp \
|
||||
src/services/standard/rdfparser.cpp \
|
||||
src/services/standard/rssparser.cpp
|
||||
|
||||
FORMS += src/gui/toolbareditor.ui \
|
||||
src/network-web/downloaditem.ui \
|
||||
|
|
|
@ -1,120 +0,0 @@
|
|||
// This file is part of RSS Guard.
|
||||
//
|
||||
// Copyright (C) 2011-2017 by Martin Rotter <rotter.martinos@gmail.com>
|
||||
//
|
||||
// RSS Guard is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// RSS Guard is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#include "core/atomparser.h"
|
||||
|
||||
#include "miscellaneous/textfactory.h"
|
||||
#include "network-web/webfactory.h"
|
||||
|
||||
#include <QDomDocument>
|
||||
|
||||
|
||||
// Default constructor; performs no initialization of its own.
AtomParser::AtomParser() {}

// Destructor; performs no explicit cleanup.
AtomParser::~AtomParser() {}
|
||||
|
||||
// Parses raw Atom XML and returns one Message per <entry> element.
//
// data - textual XML of the whole feed.
//
// Entries which have neither a title nor a summary/content text are skipped.
// When an entry has no parseable <updated> value, the time at which parsing
// started is used as its creation date.
QList<Message> AtomParser::parseXmlData(const QString &data) {
  QList<Message> messages;
  QDomDocument xml_file;
  const QDateTime current_time = QDateTime::currentDateTime();

  xml_file.setContent(data, true);

  // Pull out all messages.
  const QDomNodeList messages_in_xml = xml_file.elementsByTagName(QSL("entry"));

  for (int i = 0; i < messages_in_xml.size(); i++) {
    const QDomNode message_item = messages_in_xml.item(i);
    Message new_message;

    // Deal with titles & descriptions.
    const QString elem_title = message_item.namedItem(QSL("title")).toElement().text().simplified();
    QString elem_summary = message_item.namedItem(QSL("summary")).toElement().text();

    if (elem_summary.isEmpty()) {
      elem_summary = message_item.namedItem(QSL("content")).toElement().text();
    }

    // Now we obtained maximum of information for title & description.
    if (elem_title.isEmpty()) {
      if (elem_summary.isEmpty()) {
        // BOTH title and description are empty, skip this message.
        continue;
      }

      // Title is empty but description is not, derive the title from it.
      new_message.m_title = WebFactory::instance()->stripTags(elem_summary.simplified());
    }
    else {
      // Title is not empty, description does not matter.
      new_message.m_title = WebFactory::instance()->stripTags(elem_title);
    }

    new_message.m_contents = elem_summary;

    // Deal with links. A dedicated index name is used so that the outer
    // loop counter is not shadowed.
    const QDomNodeList elem_links = message_item.toElement().elementsByTagName(QSL("link"));

    for (int link_index = 0; link_index < elem_links.size(); link_index++) {
      const QDomElement link = elem_links.at(link_index).toElement();

      if (link.attribute(QSL("rel")) == QSL("enclosure")) {
        new_message.m_enclosures.append(Enclosure(link.attribute(QSL("href")), link.attribute(QSL("type"))));

        qDebug("Adding enclosure '%s' for the message.", qPrintable(new_message.m_enclosures.last().m_url));
      }
      else {
        new_message.m_url = link.attribute(QSL("href"));
      }
    }

    // No regular link found, fall back to the first enclosure.
    if (new_message.m_url.isEmpty() && !new_message.m_enclosures.isEmpty()) {
      new_message.m_url = new_message.m_enclosures.first().m_url;
    }

    // Deal with authors.
    new_message.m_author = WebFactory::instance()->escapeHtml(message_item.namedItem(QSL("author")).namedItem(QSL("name")).toElement().text());

    // Deal with creation date.
    new_message.m_created = TextFactory::parseDateTime(message_item.namedItem(QSL("updated")).toElement().text());
    new_message.m_createdFromFeed = !new_message.m_created.isNull();

    if (!new_message.m_createdFromFeed) {
      // Date was NOT obtained from the feed, set current date as creation date for the message.
      new_message.m_created = current_time;
    }

    // WARNING: There is a difference between "" and QString() in terms of nullptr SQL values!
    // This is because of difference in QString::isNull() and QString::isEmpty(), the "" is not null
    // while QString() is.
    if (new_message.m_author.isNull()) {
      new_message.m_author = "";
    }

    if (new_message.m_url.isNull()) {
      new_message.m_url = "";
    }

    messages.append(new_message);
  }

  return messages;
}
|
129
src/services/standard/atomparser.cpp
Normal file
129
src/services/standard/atomparser.cpp
Normal file
|
@ -0,0 +1,129 @@
|
|||
// This file is part of RSS Guard.
|
||||
//
|
||||
// Copyright (C) 2011-2017 by Martin Rotter <rotter.martinos@gmail.com>
|
||||
//
|
||||
// RSS Guard is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// RSS Guard is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#include "services/standard/atomparser.h"
|
||||
|
||||
#include "miscellaneous/textfactory.h"
|
||||
#include "network-web/webfactory.h"
|
||||
|
||||
#include "exceptions/applicationexception.h"
|
||||
|
||||
|
||||
// Hands the raw feed data over to the FeedParser base and remembers the
// Atom 1.0 namespace URI used for all namespace-aware element lookups.
AtomParser::AtomParser(const QString &data)
  : FeedParser(data),
    m_atomNamespace(QSL("http://www.w3.org/2005/Atom")) {}
|
||||
|
||||
// Destructor; performs no explicit cleanup.
AtomParser::~AtomParser() {}
|
||||
|
||||
// Returns the names of the feed's authors joined with ", ".
//
// For every Atom <author> element found below the document element, the
// text of its first Atom <name> element is collected; authors without
// a <name> are ignored.
//
// NOTE(review): elementsByTagNameNS() is documented to return all descendants,
// so <author> elements nested inside entries are presumably picked up here
// as well - confirm this is intended.
QString AtomParser::feedAuthor() const {
  const QDomNodeList author_nodes = m_xml.documentElement().elementsByTagNameNS(m_atomNamespace, QSL("author"));
  QStringList collected_names;

  for (int idx = 0; idx < author_nodes.size(); ++idx) {
    const QDomElement author = author_nodes.at(idx).toElement();
    const QDomNodeList name_nodes = author.elementsByTagNameNS(m_atomNamespace, QSL("name"));

    if (name_nodes.isEmpty()) {
      continue;
    }

    collected_names.append(name_nodes.at(0).toElement().text());
  }

  return collected_names.join(", ");
}
|
||||
|
||||
// Converts one Atom <entry> element into a Message.
//
// msg_element  - the entry element to read.
// current_time - used as creation date when the entry carries no parseable
//                <updated> value.
//
// Returns the filled message. Its URL is chosen in this order of preference:
// last "alternate" (or rel-less) link, last link with any other rel, first
// enclosure.
//
// Throws ApplicationException (by value) when the entry has neither a title
// nor any content/summary text.
Message AtomParser::extractMessage(const QDomElement &msg_element, QDateTime current_time) const {
  Message new_message;
  QString title = textsFromPath(msg_element, m_atomNamespace, QSL("title"), true).join(QSL(", "));
  QString summary = textsFromPath(msg_element, m_atomNamespace, QSL("content"), true).join(QSL(", "));

  // Prefer <content>, fall back to <summary> only if no content text exists.
  if (summary.isEmpty()) {
    summary = textsFromPath(msg_element, m_atomNamespace, QSL("summary"), true).join(QSL(", "));
  }

  // Now we obtained maximum of information for title & description.
  if (title.isEmpty() && summary.isEmpty()) {
    // BOTH title and description are empty, skip this message.
    //
    // The exception is thrown by value so that handlers of the form
    // "catch (const ApplicationException &)" match it. Throwing a pointer
    // created with "new" would bypass such handlers and leak the object.
    throw ApplicationException(QSL("Not enough data for the message."));
  }

  // At least one of title/description is available here; either one of them
  // may still be empty.
  new_message.m_title = WebFactory::instance()->stripTags(title);
  new_message.m_contents = summary;
  new_message.m_author = WebFactory::instance()->escapeHtml(messageAuthor(msg_element));

  QString updated = textsFromPath(msg_element, m_atomNamespace, QSL("updated"), true).join(QSL(", "));

  // Deal with creation date.
  new_message.m_created = TextFactory::parseDateTime(updated);
  new_message.m_createdFromFeed = !new_message.m_created.isNull();

  if (!new_message.m_createdFromFeed) {
    // Date was NOT obtained from the feed, set current date as creation date for the message.
    new_message.m_created = current_time;
  }

  // Deal with links.
  QDomNodeList elem_links = msg_element.toElement().elementsByTagNameNS(m_atomNamespace, QSL("link"));
  QString last_link_alternate, last_link_other;

  for (int i = 0; i < elem_links.size(); i++) {
    QDomElement link = elem_links.at(i).toElement();
    QString attribute = link.attribute(QSL("rel"));

    if (attribute == QSL("enclosure")) {
      new_message.m_enclosures.append(Enclosure(link.attribute(QSL("href")), link.attribute(QSL("type"))));

      qDebug("Adding enclosure '%s' for the message.", qPrintable(new_message.m_enclosures.last().m_url));
    }
    else if (attribute.isEmpty() || attribute == QSL("alternate")) {
      // A link without "rel" is handled the same way as rel="alternate".
      last_link_alternate = link.attribute(QSL("href"));
    }
    else {
      last_link_other = link.attribute(QSL("href"));
    }
  }

  if (!last_link_alternate.isEmpty()) {
    new_message.m_url = last_link_alternate;
  }
  else if (!last_link_other.isEmpty()) {
    new_message.m_url = last_link_other;
  }
  else if (!new_message.m_enclosures.isEmpty()) {
    new_message.m_url = new_message.m_enclosures.first().m_url;
  }

  return new_message;
}
|
||||
|
||||
// Returns the names of the authors of a single entry joined with ", ".
//
// msg_element - element whose Atom <author> descendants are inspected.
//
// For each <author>, the text of its first Atom <name> element is taken;
// authors without a <name> contribute nothing.
QString AtomParser::messageAuthor(const QDomElement &msg_element) const {
  const QDomNodeList author_nodes = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("author"));
  QStringList collected_names;

  for (int idx = 0; idx < author_nodes.size(); ++idx) {
    const QDomElement author = author_nodes.at(idx).toElement();
    const QDomNodeList name_nodes = author.elementsByTagNameNS(m_atomNamespace, QSL("name"));

    if (name_nodes.isEmpty()) {
      continue;
    }

    collected_names.append(name_nodes.at(0).toElement().text());
  }

  return collected_names.join(", ");
}
|
||||
|
||||
// Returns every Atom <entry> element of the parsed document; each one
// represents a single message.
QDomNodeList AtomParser::messageElements() {
  QDomNodeList entries = m_xml.elementsByTagNameNS(m_atomNamespace, QSL("entry"));

  return entries;
}
|
|
@ -18,17 +18,27 @@
|
|||
#ifndef ATOMPARSER_H
|
||||
#define ATOMPARSER_H
|
||||
|
||||
#include "services/standard/feedparser.h"
|
||||
|
||||
#include "core/message.h"
|
||||
|
||||
#include <QList>
|
||||
#include <QDomDocument>
|
||||
|
||||
|
||||
class AtomParser {
|
||||
class AtomParser : public FeedParser {
|
||||
public:
|
||||
explicit AtomParser();
|
||||
explicit AtomParser(const QString &data);
|
||||
virtual ~AtomParser();
|
||||
|
||||
QList<Message> parseXmlData(const QString &data);
|
||||
private:
|
||||
QDomNodeList messageElements();
|
||||
QString feedAuthor() const;
|
||||
Message extractMessage(const QDomElement &msg_element, QDateTime current_time) const;
|
||||
QString messageAuthor(const QDomElement &msg_element) const;
|
||||
|
||||
private:
|
||||
QString m_atomNamespace;
|
||||
};
|
||||
|
||||
#endif // ATOMPARSER_H
|
96
src/services/standard/feedparser.cpp
Normal file
96
src/services/standard/feedparser.cpp
Normal file
|
@ -0,0 +1,96 @@
|
|||
// This file is part of RSS Guard.
|
||||
//
|
||||
// Copyright (C) 2011-2017 by Martin Rotter <rotter.martinos@gmail.com>
|
||||
//
|
||||
// RSS Guard is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// RSS Guard is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#include "services/standard/feedparser.h"
|
||||
|
||||
#include "exceptions/applicationexception.h"
|
||||
|
||||
|
||||
// Stores the raw feed text and loads it into the DOM document with
// namespace processing switched on.
//
// NOTE(review): the result of setContent() is not checked here.
FeedParser::FeedParser(const QString &data)
  : m_xmlData(data) {
  const bool namespace_processing = true;

  m_xml.setContent(m_xmlData, namespace_processing);
}
|
||||
|
||||
// Destructor; performs no explicit cleanup.
FeedParser::~FeedParser() {}
|
||||
|
||||
// Extracts all messages from the parsed feed.
//
// Every element returned by messageElements() is passed to extractMessage().
// A message with an empty author inherits the feed-level author. Elements
// for which extractMessage() throws ApplicationException are logged and
// skipped, they do not abort processing of the remaining elements.
QList<Message> FeedParser::messages() {
  QString feed_author = feedAuthor();

  QList<Message> messages;
  QDateTime current_time = QDateTime::currentDateTime();

  // Pull out all messages.
  QDomNodeList messages_in_xml = messageElements();

  for (int i = 0; i < messages_in_xml.size(); i++) {
    QDomNode message_item = messages_in_xml.item(i);

    try {
      Message new_message = extractMessage(message_item.toElement(), current_time);

      if (new_message.m_author.isEmpty()) {
        new_message.m_author = feed_author;
      }

      messages.append(new_message);
    }
    catch (const ApplicationException &ex) {
      // The message text is runtime data and must never be used as the
      // printf-style format itself, otherwise any '%' inside of it would be
      // interpreted as a conversion specifier.
      qDebug("%s", qPrintable(ex.message()));
    }
  }

  return messages;
}
|
||||
|
||||
// Follows a '/'-separated path of local element names, starting at
// "element", and returns the text of every element reached at the end of
// the path.
//
// element       - starting point of the search.
// namespace_uri - namespace every path segment has to belong to.
// xml_path      - local names separated by '/', e.g. "author/name".
// only_first    - if true, at most one element is followed on each level,
//                 so the result holds at most one string.
//
// Returns an empty list when any segment of the path matches nothing.
QStringList FeedParser::textsFromPath(const QDomElement &element, const QString &namespace_uri,
                                      const QString &xml_path, bool only_first) const {
  QStringList remaining_segments = xml_path.split('/');
  QList<QDomElement> frontier;

  frontier.append(element);

  // Descend one path segment per iteration.
  while (!remaining_segments.isEmpty()) {
    const QString local_name = remaining_segments.takeFirst();
    QList<QDomElement> matched;

    for (int parent_idx = 0; parent_idx < frontier.size(); ++parent_idx) {
      const QDomNodeList candidates = frontier.at(parent_idx).elementsByTagNameNS(namespace_uri, local_name);

      for (int child_idx = 0; child_idx < candidates.size(); ++child_idx) {
        matched.append(candidates.at(child_idx).toElement());

        if (only_first) {
          break;
        }
      }

      // A single match is all that is wanted, the other parents are not inspected.
      if (only_first && matched.size() == 1) {
        break;
      }
    }

    frontier = matched;
  }

  QStringList texts;

  for (int idx = 0; idx < frontier.size(); ++idx) {
    texts.append(frontier.at(idx).text());
  }

  return texts;
}
|
45
src/services/standard/feedparser.h
Normal file
45
src/services/standard/feedparser.h
Normal file
|
@ -0,0 +1,45 @@
|
|||
// This file is part of RSS Guard.
|
||||
//
|
||||
// Copyright (C) 2011-2017 by Martin Rotter <rotter.martinos@gmail.com>
|
||||
//
|
||||
// RSS Guard is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// RSS Guard is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#ifndef FEEDPARSER_H
|
||||
#define FEEDPARSER_H
|
||||
|
||||
#include <QDomDocument>
|
||||
#include <QString>
|
||||
|
||||
#include "core/message.h"
|
||||
|
||||
|
||||
class FeedParser {
|
||||
public:
|
||||
explicit FeedParser(const QString &data);
|
||||
virtual ~FeedParser();
|
||||
|
||||
virtual QList<Message> messages();
|
||||
|
||||
protected:
|
||||
QStringList textsFromPath(const QDomElement &element, const QString &namespace_uri, const QString &xml_path, bool only_first) const;
|
||||
virtual QDomNodeList messageElements() = 0;
|
||||
virtual QString feedAuthor() const = 0;
|
||||
virtual Message extractMessage(const QDomElement &msg_element, QDateTime current_time) const = 0;
|
||||
|
||||
protected:
|
||||
QString m_xmlData;
|
||||
QDomDocument m_xml;
|
||||
};
|
||||
|
||||
#endif // FEEDPARSER_H
|
|
@ -15,7 +15,7 @@
|
|||
// You should have received a copy of the GNU General Public License
|
||||
// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#include "core/rdfparser.h"
|
||||
#include "services/standard/rdfparser.h"
|
||||
|
||||
#include "miscellaneous/textfactory.h"
|
||||
#include "network-web/webfactory.h"
|
|
@ -15,7 +15,7 @@
|
|||
// You should have received a copy of the GNU General Public License
|
||||
// along with RSS Guard. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#include "core/rssparser.h"
|
||||
#include "services/standard/rssparser.h"
|
||||
|
||||
#include "miscellaneous/textfactory.h"
|
||||
#include "network-web/webfactory.h"
|
|
@ -18,9 +18,9 @@
|
|||
#include "services/standard/standardfeed.h"
|
||||
|
||||
#include "definitions/definitions.h"
|
||||
#include "core/rssparser.h"
|
||||
#include "core/rdfparser.h"
|
||||
#include "core/atomparser.h"
|
||||
#include "services/standard/rssparser.h"
|
||||
#include "services/standard/rdfparser.h"
|
||||
#include "services/standard/atomparser.h"
|
||||
#include "core/feedsmodel.h"
|
||||
#include "miscellaneous/databasequeries.h"
|
||||
#include "miscellaneous/textfactory.h"
|
||||
|
@ -454,7 +454,7 @@ QList<Message> StandardFeed::obtainNewMessages(bool *error_during_obtaining) {
|
|||
break;
|
||||
|
||||
case StandardFeed::Atom10:
|
||||
messages = AtomParser().parseXmlData(formatted_feed_contents);
|
||||
messages = AtomParser(formatted_feed_contents).messages();
|
||||
|
||||
default:
|
||||
break;
|
||||
|
|
Loading…
Add table
Reference in a new issue