From edf696bae2f7c76f6727d9b7774847d4847ff9e9 Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Mon, 9 Oct 2023 11:12:14 +0200 Subject: [PATCH] totally reworked process argument tokenization, allow to use both single and double quotes and removing some other limitation --- docs/source/features/scraping.md | 44 ++--- src/librssguard/miscellaneous/textfactory.cpp | 177 +++++++++++++++--- src/librssguard/miscellaneous/textfactory.h | 2 +- .../standard/gui/standardfeeddetails.cpp | 17 +- 4 files changed, 183 insertions(+), 57 deletions(-) diff --git a/docs/source/features/scraping.md b/docs/source/features/scraping.md index e6996cac9..b531bc9b9 100644 --- a/docs/source/features/scraping.md +++ b/docs/source/features/scraping.md @@ -19,8 +19,24 @@ However, if you choose `Script` option, then you cannot provide URL of your feed Any errors in your script must be written to [**error output** (stderr)](https://en.wikipedia.org/wiki/Standard_streams#Standard_error_(stderr)). -```{warning} -As of RSS Guard 4.2.0, you cannot separate your arguments with `#`. If your argument contains spaces, then enclose it with DOUBLE quotes, for example `"my argument"`. DO NOT use SINGLE quotes to do that. +:::{warning} +If your path to executable contains backslashes as directory separators, make sure to escape them with another backslash. Quote each individual argument with double quotes `"arg"` or single quotes `'arg'` and separate all arguments with spaces. You have to escape some characters inside double-quoted argument, for example double quote itself like this `"arg with \"quoted\" part"`. + +Examples (one per line): + +``` +C:\\MyFolder\\My.exe "arg1" "arg2" "my \"quoted\" arg3" 'my "quoted" arg4' + +bash "%data%/scripts/download-feed.sh" + +%data%\\jq.exe '{ version: "1.1", title: "Stars", items: map( . 
| .title=.full_name | .content_text=.description | .date_published=.pushed_at)}' +``` +::: + +RSS Guard offers [placeholder](userdata.md#data-placeholder) `%data%` which is automatically replaced with full path to RSS Guard user data folder and you can use this placeholder anywhere in your script call line. + +```{attention} +Working directory of process executing the script is set to point to RSS Guard [user data](userdata) folder. ``` Format of post-process script execution line can be seen on picture below. @@ -31,25 +47,7 @@ If everything goes well, script must return `0` as the process exit code, or a n Executable file must be always be specified, while arguments do not. Be very careful when quoting arguments. Tested examples of valid execution lines are: -| Command | Explanation | -| :--- | --- | -| `bash -c "curl 'https://github.com/martinrotter.atom'"` | Download ATOM feed file using Bash and Curl. | -| `Powershell Invoke-WebRequest "https://github.com/martinrotter.atom" \| Select-Object -ExpandProperty Content` | Download ATOM feed file with Powershell. | -| `php tweeper.php -v 0 "https://twitter.com/NSACareers"` | Scrape Twitter RSS feed file with [Tweeper](https://git.ao2.it/tweeper.git). Tweeper is the utility that produces RSS feed from Twitter and other similar social platforms. | - -```{note} -The above examples are cross-platform. You can use exactly the same command on Windows, Linux or macOS, if your operating system is properly configured. -``` - -RSS Guard offers [placeholder](userdata.md#data-placeholder) `%data%` which is automatically replaced with full path to RSS Guard user data folder, allowing you to make your configuration fully portable. You can, therefore, use something like this as a source script line: `bash %data%/scripts/download-feed.sh`. - -```{attention} -Working directory of process executing the script is set to point to RSS Guard [user data](userdata) folder. 
-``` - -There are [examples of website scrapers](https://github.com/martinrotter/rssguard/tree/master/resources/scripts/scrapers). Most of them are written in Python 3, so their execution line is similar to `python script.py`. Make sure to examine each script for more information on how to use it. - ----- +## Dataflow After your source feed data is downloaded either via URL or custom script, you can optionally post-process it with one more custom script, which will take **raw source data as input**. It must produce valid feed data to standard output while printing all error messages to error output. Here is little flowchart explaining where and when scripts are used: @@ -76,6 +74,10 @@ Typical post-processing filter might do things like CSS formatting, localization It's completely up to you if you decide to only use script as `Source` of the script or separate your custom functionality between `Source` script and `Post-process` script. Sometimes you might need different `Source` scripts for different online sources and the same `Post-process` script and vice versa. +## Example Scrapers +There are [examples of website scrapers](https://github.com/martinrotter/rssguard/tree/master/resources/scripts/scrapers). Most of them are written in Python 3, so their execution line is similar to `python "script.py"`. Make sure to examine each script for more information on how to use it. + +## 3rd-party Tools Third-party tools for scraping made to work with RSS Guard: * [CSS2RSS](https://github.com/Owyn/CSS2RSS) - can be used to scrape websites with CSS selectors. * [RSSGuardHelper](https://github.com/pipiscrew/RSSGuardHelper) - another CSS selectors helper. 
diff --git a/src/librssguard/miscellaneous/textfactory.cpp b/src/librssguard/miscellaneous/textfactory.cpp index b1510f9d1..c93b4c677 100644 --- a/src/librssguard/miscellaneous/textfactory.cpp +++ b/src/librssguard/miscellaneous/textfactory.cpp @@ -175,45 +175,166 @@ QString TextFactory::capitalizeFirstLetter(const QString& sts) { } } -QStringList TextFactory::tokenizeProcessArguments(QStringView command) { +enum class TokenState { + // We are not inside argument, we are between arguments. + Normal, + + // We have detected escape "\" character coming from double-quoted argument. + EscapedFromDoubleQuotes, + + // We have detected escape "\" character coming from spaced argument. + EscapedFromSpaced, + + // We are inside argument which was separated by spaces. + InsideArgSpaced, + + // We are inside double-quoted argument. + InsideArgDoubleQuotes, + + // We are inside argument, do not evaluate anything, just take it all + // as raw text. + InsideArgSingleQuotes +}; + +QStringList TextFactory::tokenizeProcessArguments(const QString& command) { + // Each argument containing spaces must be enclosed with single '' or double "" quotes. + // Some characters must be escaped with \ to keep their textual values as + // long as double-quoted argument is used. + + if (command.isEmpty()) { + return {}; + } + + // We append space to end of command to make sure that + // ending space-separated argument is processed. + QString my_command = command + u' '; + + TokenState state = TokenState::Normal; QStringList args; - QString tmp; - int quote_count = 0; - bool in_quote = false; + QString arg; - for (int i = 0; i < command.size(); ++i) { - if (command.at(i) == QL1C('"')) { - ++quote_count; + for (QChar chr : my_command) { + switch (state) { + case TokenState::Normal: { + switch (chr.unicode()) { + case u'"': + // We start double-quoted argument. 
+ state = TokenState::InsideArgDoubleQuotes; + continue; - if (quote_count == 3) { - quote_count = 0; - tmp += command.at(i); + case u'\'': + // We start single-quoted argument. + state = TokenState::InsideArgSingleQuotes; + continue; + + case u' ': + // Whitespace, just go on. + continue; + + default: + // We found some actual text which marks + // beginning of argument, we assume spaced argument. + arg.append(chr); + state = TokenState::InsideArgSpaced; + continue; + } + + break; } - continue; - } - - if (quote_count) { - if (quote_count == 1) { - in_quote = !in_quote; + case TokenState::EscapedFromDoubleQuotes: { + // Previous character was "\". + arg.append(chr); + state = TokenState::InsideArgDoubleQuotes; + break; } - quote_count = 0; - } - - if (!in_quote && command.at(i).isSpace()) { - if (!tmp.isEmpty()) { - args += tmp; - tmp.clear(); + case TokenState::EscapedFromSpaced: { + // Previous character was "\". + arg.append(chr); + state = TokenState::InsideArgSpaced; + break; + } + + case TokenState::InsideArgSpaced: { + switch (chr.unicode()) { + case u'\\': + // We found escaped! + state = TokenState::EscapedFromSpaced; + continue; + + case u' ': + // We need to end this argument. + args.append(arg); + arg.clear(); + state = TokenState::Normal; + continue; + + default: + arg.append(chr); + break; + } + + break; + } + + case TokenState::InsideArgDoubleQuotes: { + switch (chr.unicode()) { + case u'\\': + // We found escaped! + state = TokenState::EscapedFromDoubleQuotes; + continue; + + case u'"': + // We need to end this argument. + args.append(arg); + arg.clear(); + state = TokenState::Normal; + continue; + + default: + arg.append(chr); + break; + } + + break; + } + + case TokenState::InsideArgSingleQuotes: { + switch (chr.unicode()) { + case u'\'': + // We need to end this argument. 
+ args.append(arg); + arg.clear(); + state = TokenState::Normal; + continue; + + default: + arg.append(chr); + break; + } + + break; } - } - else { - tmp += command.at(i); } } - if (!tmp.isEmpty()) { - args += tmp; + switch (state) { + case TokenState::EscapedFromSpaced: + case TokenState::EscapedFromDoubleQuotes: + throw ApplicationException(QObject::tr("escape sequence not completed")); + break; + + case TokenState::InsideArgDoubleQuotes: + throw ApplicationException(QObject::tr("closing \" is missing")); + break; + + case TokenState::InsideArgSingleQuotes: + throw ApplicationException(QObject::tr("closing ' is missing")); + break; + + default: + break; } return args; diff --git a/src/librssguard/miscellaneous/textfactory.h b/src/librssguard/miscellaneous/textfactory.h index ea465dc7f..4ebdefa43 100644 --- a/src/librssguard/miscellaneous/textfactory.h +++ b/src/librssguard/miscellaneous/textfactory.h @@ -36,7 +36,7 @@ class TextFactory { static QString decrypt(const QString& text, quint64 key = 0); static QString newline(); static QString capitalizeFirstLetter(const QString& sts); - static QStringList tokenizeProcessArguments(QStringView args); + static QStringList tokenizeProcessArguments(const QString& command); // Shortens input string according to given length limit. 
static QString shorten(const QString& input, int text_length_limit = TEXT_TITLE_LIMIT); diff --git a/src/librssguard/services/standard/gui/standardfeeddetails.cpp b/src/librssguard/services/standard/gui/standardfeeddetails.cpp index 1253ce1c4..2d196eec2 100644 --- a/src/librssguard/services/standard/gui/standardfeeddetails.cpp +++ b/src/librssguard/services/standard/gui/standardfeeddetails.cpp @@ -7,6 +7,7 @@ #include "exceptions/networkexception.h" #include "exceptions/scriptexception.h" #include "miscellaneous/iconfactory.h" +#include "miscellaneous/textfactory.h" #include "network-web/networkfactory.h" #include "services/abstract/category.h" #include "services/standard/definitions.h" @@ -260,11 +261,12 @@ void StandardFeedDetails::onUrlChanged(const QString& new_url) { } } else if (sourceType() == StandardFeed::SourceType::Script) { - if (new_url.simplified().isEmpty()) { - m_ui.m_txtSource->setStatus(LineEditWithStatus::StatusType::Error, tr("The source is empty.")); + try { + TextFactory::tokenizeProcessArguments(new_url); + m_ui.m_txtSource->setStatus(LineEditWithStatus::StatusType::Ok, tr("Source is ok.")); } - else { - m_ui.m_txtSource->setStatus(LineEditWithStatus::StatusType::Ok, tr("The source is ok.")); + catch (const ApplicationException& ex) { + m_ui.m_txtSource->setStatus(LineEditWithStatus::StatusType::Error, tr("Error: %1").arg(ex.message())); } } else { @@ -273,11 +275,12 @@ void StandardFeedDetails::onUrlChanged(const QString& new_url) { } void StandardFeedDetails::onPostProcessScriptChanged(const QString& new_pp) { - if (QRegularExpression(QSL(SCRIPT_SOURCE_TYPE_REGEXP)).match(new_pp).hasMatch() || !new_pp.simplified().isEmpty()) { + try { + TextFactory::tokenizeProcessArguments(new_pp); m_ui.m_txtPostProcessScript->setStatus(LineEditWithStatus::StatusType::Ok, tr("Command is ok.")); } - else { - m_ui.m_txtPostProcessScript->setStatus(LineEditWithStatus::StatusType::Ok, tr("Command is empty.")); + catch (const ApplicationException& ex) { + 
m_ui.m_txtPostProcessScript->setStatus(LineEditWithStatus::StatusType::Error, tr("Error: %1").arg(ex.message())); } }