diff --git a/resources/rssguard.qrc b/resources/rssguard.qrc
index b43cf2970..d73ac7411 100644
--- a/resources/rssguard.qrc
+++ b/resources/rssguard.qrc
@@ -26,6 +26,7 @@
scripts/adblock/adblock-server.js
scripts/readability/readabilize-article.js
+ scripts/article-extractor/extract-article.mjs
scripts/filters/blacklist.js
scripts/filters/whitelist.js
diff --git a/resources/scripts/article-extractor/extract-article.mjs b/resources/scripts/article-extractor/extract-article.mjs
new file mode 100644
index 000000000..f118769a8
--- /dev/null
+++ b/resources/scripts/article-extractor/extract-article.mjs
@@ -0,0 +1,11 @@
+import { extract } from '@extractus/article-extractor'
+
+const input = process.argv[2];
+
+try {
+ const article = await extract(input);
+ console.log(JSON.stringify(article));
+}
+catch (err) {
+ console.error(err);
+}
\ No newline at end of file
diff --git a/src/librssguard/CMakeLists.txt b/src/librssguard/CMakeLists.txt
index 7df9f3a98..5412dac88 100644
--- a/src/librssguard/CMakeLists.txt
+++ b/src/librssguard/CMakeLists.txt
@@ -253,6 +253,8 @@ set(SOURCES
network-web/adblock/adblockrequestinfo.h
network-web/apiserver.cpp
network-web/apiserver.h
+ network-web/articleparse.cpp
+ network-web/articleparse.h
network-web/basenetworkaccessmanager.cpp
network-web/basenetworkaccessmanager.h
network-web/cookiejar.cpp
diff --git a/src/librssguard/gui/webbrowser.cpp b/src/librssguard/gui/webbrowser.cpp
index dc4540277..611bd8d5c 100644
--- a/src/librssguard/gui/webbrowser.cpp
+++ b/src/librssguard/gui/webbrowser.cpp
@@ -12,10 +12,12 @@
#include "miscellaneous/application.h"
#include "miscellaneous/iconfactory.h"
#include "miscellaneous/settings.h"
+#include "network-web/articleparse.h"
#include "network-web/networkfactory.h"
#include "network-web/readability.h"
#include "network-web/webfactory.h"
+#include
#include
#include
#include
@@ -37,7 +39,10 @@ WebBrowser::WebBrowser(WebViewer* viewer, QWidget* parent)
#endif
m_actionReadabilePage(new QAction(qApp->icons()->fromTheme(QSL("text-html")),
tr("View website in reader mode"),
- this)) {
+ this)),
+ m_actionGetFullArticle(new QAction(qApp->icons()->fromTheme(QSL("download"), QSL("browser-download")),
+ tr("Load full source article"),
+ this)) {
if (m_webView == nullptr) {
m_webView = qApp->createWebView();
dynamic_cast(m_webView)->setParent(this);
@@ -87,6 +92,7 @@ void WebBrowser::createConnections() {
connect(m_actionOpenInSystemBrowser, &QAction::triggered, this, &WebBrowser::openCurrentSiteInSystemBrowser);
connect(m_actionReadabilePage, &QAction::triggered, this, &WebBrowser::readabilePage);
+ connect(m_actionGetFullArticle, &QAction::triggered, this, &WebBrowser::getFullArticle);
#if defined(ENABLE_MEDIAPLAYER)
connect(m_actionPlayPageInMediaPlayer, &QAction::triggered, this, &WebBrowser::playCurrentSiteInMediaPlayer);
@@ -99,6 +105,9 @@ void WebBrowser::createConnections() {
connect(qApp->web()->readability(), &Readability::htmlReadabled, this, &WebBrowser::setReadabledHtml);
connect(qApp->web()->readability(), &Readability::errorOnHtmlReadabiliting, this, &WebBrowser::readabilityFailed);
+
+ connect(qApp->web()->articleParse(), &ArticleParse::articleParsed, this, &WebBrowser::setFullArticleHtml);
+ connect(qApp->web()->articleParse(), &ArticleParse::errorOnArticlePArsing, this, &WebBrowser::fullArticleFailed);
}
void WebBrowser::updateUrl(const QUrl& url) {
@@ -182,6 +191,11 @@ void WebBrowser::readabilePage() {
qApp->web()->readability()->makeHtmlReadable(this, m_webView->html(), m_webView->url().toString());
}
+// Requests extraction of the full article for the URL currently shown in the web view.
+// The triggering action is disabled before the request is handed to the article parser;
+// this browser passes itself as the requester.
+void WebBrowser::getFullArticle() {
+  const QString article_url = m_webView->url().toString();
+
+  m_actionGetFullArticle->setEnabled(false);
+  qApp->web()->articleParse()->parseArticle(this, article_url);
+}
+
bool WebBrowser::eventFilter(QObject* watched, QEvent* event) {
Q_UNUSED(watched)
@@ -292,6 +306,21 @@ void WebBrowser::readabilityFailed(QObject* sndr, const QString& error) {
}
}
+// Receives the answer of the article parser.
+//
+// "sndr" identifies the requester; answers meant for other requesters are ignored.
+// "json_answer" is expected to be a JSON object whose "content" member holds the HTML
+// of the extracted article.
+void WebBrowser::setFullArticleHtml(QObject* sndr, const QString& json_answer) {
+  if (sndr != this) {
+    return;
+  }
+
+  const QUrl url = m_webView->url();
+
+  // getFullArticle() disabled the action. Restore it here, under the same URL rule which
+  // onLoadingFinished() uses, because an unusable answer starts no page load which would
+  // restore it otherwise.
+  m_actionGetFullArticle->setEnabled(url.isValid() && !url.host().isEmpty());
+
+  if (json_answer.isEmpty()) {
+    return;
+  }
+
+  const QJsonDocument json_doc = QJsonDocument::fromJson(json_answer.toUtf8());
+  const QString better_html = json_doc["content"].toString();
+
+  // Invalid JSON, JSON "null" or a missing "content" member all end up as empty string.
+  // Keep the current page in that case rather than replacing it with a blank one.
+  if (better_html.isEmpty()) {
+    return;
+  }
+
+  m_webView->setReadabledHtml(better_html, url);
+}
+
+// Receives a failure report of the article parser.
+//
+// "sndr" identifies the requester; reports meant for other requesters are ignored.
+// A non-empty "error" text is displayed in the web view.
+void WebBrowser::fullArticleFailed(QObject* sndr, const QString& error) {
+  if (sndr != this) {
+    return;
+  }
+
+  const QUrl url = m_webView->url();
+
+  // getFullArticle() disabled the action. Restore it here, under the same URL rule which
+  // onLoadingFinished() uses, because an empty error text starts no page load which would
+  // restore it otherwise.
+  m_actionGetFullArticle->setEnabled(url.isValid() && !url.host().isEmpty());
+
+  if (!error.isEmpty()) {
+    m_webView->setReadabledHtml(error, url);
+  }
+}
+
void WebBrowser::initializeLayout() {
m_toolBar->setFloatable(false);
m_toolBar->setMovable(false);
@@ -324,9 +353,11 @@ void WebBrowser::initializeLayout() {
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
+ m_actionGetFullArticle->setEnabled(false);
// Add needed actions into toolbar.
m_toolBar->addAction(m_actionOpenInSystemBrowser);
+ m_toolBar->addAction(m_actionGetFullArticle);
m_toolBar->addAction(m_actionReadabilePage);
#if defined(ENABLE_MEDIAPLAYER)
@@ -358,6 +389,7 @@ void WebBrowser::onLoadingStarted() {
m_loadingProgress->show();
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
+ m_actionGetFullArticle->setEnabled(false);
#if defined(ENABLE_MEDIAPLAYER)
m_actionPlayPageInMediaPlayer->setEnabled(false);
@@ -375,6 +407,7 @@ void WebBrowser::onLoadingFinished(bool success) {
if (url.isValid() && !url.host().isEmpty()) {
m_actionOpenInSystemBrowser->setEnabled(true);
+ m_actionGetFullArticle->setEnabled(true);
m_actionReadabilePage->setEnabled(true);
#if defined(ENABLE_MEDIAPLAYER)
@@ -384,6 +417,7 @@ void WebBrowser::onLoadingFinished(bool success) {
else {
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
+ m_actionGetFullArticle->setEnabled(false);
#if defined(ENABLE_MEDIAPLAYER)
m_actionPlayPageInMediaPlayer->setEnabled(false);
diff --git a/src/librssguard/gui/webbrowser.h b/src/librssguard/gui/webbrowser.h
index 5692cc326..9ede837e5 100644
--- a/src/librssguard/gui/webbrowser.h
+++ b/src/librssguard/gui/webbrowser.h
@@ -74,9 +74,14 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
void newWindowRequested(WebViewer* viewer);
void readabilePage();
+ void getFullArticle();
+
void setReadabledHtml(QObject* sndr, const QString& better_html);
void readabilityFailed(QObject* sndr, const QString& error);
+ void setFullArticleHtml(QObject* sndr, const QString& json_answer);
+ void fullArticleFailed(QObject* sndr, const QString& error);
+
signals:
void windowCloseRequested();
void iconChanged(int index, const QIcon& icon);
@@ -106,6 +111,7 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
#endif
QAction* m_actionReadabilePage;
+ QAction* m_actionGetFullArticle;
QList m_messages;
QPointer m_root;
diff --git a/src/librssguard/network-web/articleparse.cpp b/src/librssguard/network-web/articleparse.cpp
new file mode 100644
index 000000000..6e83553c1
--- /dev/null
+++ b/src/librssguard/network-web/articleparse.cpp
@@ -0,0 +1,140 @@
+// For license of this file, see /LICENSE.md.
+
+#include "network-web/articleparse.h"
+
+#include "3rd-party/boolinq/boolinq.h"
+#include "exceptions/applicationexception.h"
+#include "miscellaneous/application.h"
+
+#include
+
+#define EXTRACTOR_PACKAGE "@extractus/article-extractor"
+#define EXTRACTOR_VERSION "8.0.7"
+
+// Creates the parser with both "modules installing" and "modules installed" flags cleared
+// and subscribes to package installation results reported by the application-wide
+// Node.js wrapper.
+ArticleParse::ArticleParse(QObject* parent)
+  : QObject{parent}, m_modulesInstalling(false), m_modulesInstalled(false) {
+  auto* node_js = qApp->nodejs();
+
+  connect(node_js, &NodeJs::packageError, this, &ArticleParse::onPackageError);
+  connect(node_js, &NodeJs::packageInstalledUpdated, this, &ArticleParse::onPackageReady);
+}
+
+// Reacts to a finished installation/update of Node.js packages.
+//
+// Nothing happens unless the article-extractor package is among "pkgs". If it is, the
+// package is marked as installed, the user is notified and articleParsed() is emitted
+// with a null requester. "already_up_to_date" is not used.
+void ArticleParse::onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date) {
+  Q_UNUSED(already_up_to_date)
+
+  bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
+    return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
+  });
+
+  if (!concerns_extractor) {
+    return;
+  }
+
+  m_modulesInstalled = true;
+  m_modulesInstalling = false;
+
+  qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                       {tr("Packages for article-extractor are installed"),
+                        tr("Press the button once more!"),
+                        QSystemTrayIcon::MessageIcon::Information},
+                       {true, true, false});
+
+  // No particular requester is known at this point, hence the null sender. Note that the
+  // text is a plain message, not the JSON document emitted after a finished extraction.
+  emit articleParsed(nullptr, tr("Packages for article-extractor are installed. You can now use this feature!"));
+}
+
+// Reacts to a failed installation/update of Node.js packages.
+//
+// Nothing happens unless the article-extractor package is among "pkgs". If it is, both
+// installation flags are cleared, the user is notified about "error" and the failure is
+// announced via errorOnArticlePArsing() with a null requester.
+void ArticleParse::onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error) {
+  bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
+    return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
+  });
+
+  if (!concerns_extractor) {
+    return;
+  }
+
+  m_modulesInstalled = m_modulesInstalling = false;
+
+  qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                       {tr("Packages for article-extractor are NOT installed"),
+                        tr("There is error: %1").arg(error),
+                        QSystemTrayIcon::MessageIcon::Critical},
+                       {true, true, false});
+
+  // This is a failure, so it goes out through the error signal and not through
+  // articleParsed(), whose payload is meant to be the extracted article.
+  // No particular requester is known at this point, hence the null sender.
+  emit errorOnArticlePArsing(nullptr,
+                             tr("Packages for article-extractor are NOT installed. There is error: %1").arg(error));
+}
+
+// Starts asynchronous extraction of the full article found at "url".
+//
+// "sndr" is handed back untouched in the resulting articleParsed() or
+// errorOnArticlePArsing() signal, so that receivers can recognize answers to their own
+// requests.
+//
+// Until the article-extractor package is known to be up to date, the extractor script is
+// copied into the Node.js package folder and the package status is checked:
+//  - missing/outdated package: its installation is started (unless already running) and
+//    the method returns without starting extraction and without any signal for "sndr",
+//  - Node.js status check failure: errorOnArticlePArsing() is emitted and the method
+//    returns.
+void ArticleParse::parseArticle(QObject* sndr, const QString& url) {
+  if (!m_modulesInstalled) {
+    // NOTE: Here we use MJS file directly placed in its NODE package folder
+    // because NODE_PATH is not supported for MJS files.
+    m_scriptFilename = qApp->nodejs()->processedPackageFolder() + QDir::separator() + QSL("extract-article.mjs");
+
+    if (!IOFactory::copyFile(QSL(":/scripts/article-extractor/extract-article.mjs"), m_scriptFilename)) {
+      qCriticalNN << LOGSEC_CORE << "Failed to copy article-extractor script to Node.js package folder.";
+    }
+
+    try {
+      NodeJs::PackageStatus st_extractor =
+        qApp->nodejs()->packageStatus({QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)});
+
+      if (st_extractor != NodeJs::PackageStatus::UpToDate) {
+        if (!m_modulesInstalling) {
+          // We make sure to update modules.
+          m_modulesInstalling = true;
+
+          qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                               {tr("Node.js libraries not installed"),
+                                tr("%1 will now install some needed libraries, this will take only a few seconds. "
+                                   "You will be notified when installation is complete.")
+                                  .arg(QSL(APP_NAME)),
+                                QSystemTrayIcon::MessageIcon::Warning},
+                               {true, true, false});
+          qApp->nodejs()->installUpdatePackages({{QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)}});
+        }
+
+        return;
+      }
+      else {
+        m_modulesInstalled = true;
+      }
+    }
+    catch (const ApplicationException& ex) {
+      const QString problem = tr("Node.js is not configured properly. Go to \"Settings\" -> \"Node.js\" and check "
+                                 "if your Node.js is properly configured.");
+
+      qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                           {tr("Node.js libraries not installed"), problem, QSystemTrayIcon::MessageIcon::Critical},
+                           {true, true, false});
+
+      qCriticalNN << LOGSEC_CORE << "Failed to check for Node.js package status:" << QUOTE_W_SPACE_DOT(ex.message());
+
+      // Report the problem through the error signal (the text is no JSON article) and
+      // do not try to launch the script with misconfigured Node.js.
+      emit errorOnArticlePArsing(sndr, problem);
+      return;
+    }
+  }
+
+  QProcess* proc = new QProcess(this);
+
+  connect(proc,
+          QOverload<int, QProcess::ExitStatus>::of(&QProcess::finished),
+          this,
+          [=](int exit_code, QProcess::ExitStatus exit_status) {
+            onParsingFinished(sndr, exit_code, exit_status);
+          });
+
+  // QProcess::finished() is not emitted for a process which could not be started, so
+  // that case is reported and cleaned up here. Other process errors are left to the
+  // finished() handler.
+  connect(proc, &QProcess::errorOccurred, this, [=](QProcess::ProcessError error) {
+    if (error == QProcess::ProcessError::FailedToStart) {
+      emit errorOnArticlePArsing(sndr, proc->errorString());
+      proc->deleteLater();
+    }
+  });
+
+  qApp->nodejs()->runScript(proc, m_scriptFilename, {url});
+}
+
+// Handles the end of an extractor process; the process is taken from sender().
+//
+// A normal exit with exit code EXIT_SUCCESS and some standard output is announced with
+// articleParsed() carrying that output. Everything else is announced with
+// errorOnArticlePArsing() carrying the standard error output, or the error string of the
+// process if there is no such output. "sndr" is forwarded untouched in both signals.
+void ArticleParse::onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status) {
+  QProcess* proc = qobject_cast<QProcess*>(sender());
+
+  if (proc == nullptr) {
+    // Not called as a reaction to a QProcess signal, there is no output to read.
+    emit errorOnArticlePArsing(sndr, QString());
+    return;
+  }
+
+  const bool finished_ok = exit_status == QProcess::ExitStatus::NormalExit && exit_code == EXIT_SUCCESS;
+  const QString output = QString::fromUtf8(proc->readAllStandardOutput());
+
+  // A clean exit without any output carries no article, treat it as a failure too.
+  if (finished_ok && !output.trimmed().isEmpty()) {
+    emit articleParsed(sndr, output);
+  }
+  else {
+    QString err = QString::fromUtf8(proc->readAllStandardError());
+
+    if (err.trimmed().isEmpty()) {
+      err = proc->errorString();
+    }
+
+    emit errorOnArticlePArsing(sndr, err);
+  }
+
+  proc->deleteLater();
+}
diff --git a/src/librssguard/network-web/articleparse.h b/src/librssguard/network-web/articleparse.h
new file mode 100644
index 000000000..672273593
--- /dev/null
+++ b/src/librssguard/network-web/articleparse.h
@@ -0,0 +1,34 @@
+// For license of this file, see /LICENSE.md.
+
+#ifndef ARTICLEPARSE_H
+#define ARTICLEPARSE_H
+
+#include "miscellaneous/nodejs.h"
+
+#include
+#include
+
+// Extracts full articles from web pages by running the "extract-article.mjs" script
+// via Node.js. The needed Node.js package is installed on demand.
+class ArticleParse : public QObject {
+    Q_OBJECT
+
+  public:
+    explicit ArticleParse(QObject* parent = nullptr);
+
+    // Starts asynchronous extraction of the article found at "url". "sndr" identifies
+    // the requester and is passed back in the signals below.
+    void parseArticle(QObject* sndr, const QString& url);
+
+  private slots:
+    // Called when the extractor process started for "sndr" ends.
+    void onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status);
+
+    // Called with results of Node.js package installation/update.
+    void onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date);
+    void onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error);
+
+  signals:
+    // Emitted with standard output of a successfully finished extractor process, which is
+    // the extracted article serialized as JSON. Also emitted with a plain text message
+    // and null "sndr" when the needed package gets installed.
+    void articleParsed(QObject* sndr, const QString& better_html);
+
+    // Emitted with a textual description when the extraction fails.
+    void errorOnArticlePArsing(QObject* sndr, const QString& error);
+
+  private:
+    // True while installation of the needed Node.js package is in progress.
+    bool m_modulesInstalling;
+
+    // True once the needed Node.js package is known to be installed and up to date.
+    bool m_modulesInstalled;
+
+    // Full path to the extractor script copied into the Node.js package folder.
+    QString m_scriptFilename;
+};
+
+#endif // ARTICLEPARSE_H
diff --git a/src/librssguard/network-web/webfactory.cpp b/src/librssguard/network-web/webfactory.cpp
index 5ab3af5f5..14f8637a0 100644
--- a/src/librssguard/network-web/webfactory.cpp
+++ b/src/librssguard/network-web/webfactory.cpp
@@ -8,6 +8,7 @@
#include "miscellaneous/settings.h"
#include "network-web/adblock/adblockmanager.h"
#include "network-web/apiserver.h"
+#include "network-web/articleparse.h"
#include "network-web/cookiejar.h"
#include "network-web/readability.h"
@@ -54,6 +55,7 @@ WebFactory::WebFactory(QObject* parent) : QObject(parent), m_apiServer(nullptr),
m_cookieJar = new CookieJar(this);
m_readability = new Readability(this);
+ m_articleParse = new ArticleParse(this);
#if defined(NO_LITE)
#if QT_VERSION >= 0x050D00 // Qt >= 5.13.0
@@ -568,6 +570,10 @@ Readability* WebFactory::readability() const {
return m_readability;
}
+// Gives access to the article parser which this factory creates in its constructor
+// with itself as the parent.
+ArticleParse* WebFactory::articleParse() const {
+  ArticleParse* parser = m_articleParse;
+
+  return parser;
+}
+
void WebFactory::startApiServer() {
m_apiServer = new ApiServer(this);
m_apiServer->setListenAddressPort(QSL("http://localhost:54123"), true);
diff --git a/src/librssguard/network-web/webfactory.h b/src/librssguard/network-web/webfactory.h
index acc55a15d..b1db2e1ef 100644
--- a/src/librssguard/network-web/webfactory.h
+++ b/src/librssguard/network-web/webfactory.h
@@ -20,6 +20,7 @@ class AdBlockManager;
class CookieJar;
class ApiServer;
class Readability;
+class ArticleParse;
class RSSGUARD_DLLSPEC WebFactory : public QObject {
Q_OBJECT
@@ -50,6 +51,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
CookieJar* cookieJar() const;
Readability* readability() const;
+ ArticleParse* articleParse() const;
void startApiServer();
void stopApiServer();
@@ -95,6 +97,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
ApiServer* m_apiServer;
CookieJar* m_cookieJar;
Readability* m_readability;
+ ArticleParse* m_articleParse;
QString m_customUserAgent;
};