qBittorrent
rss_parser.cpp
Go to the documentation of this file.
1 /*
2  * Bittorrent Client using Qt and libtorrent.
3  * Copyright (C) 2015 Vladimir Golovnev <glassez@yandex.ru>
4  * Copyright (C) 2012 Christophe Dumez <chris@qbittorrent.org>
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19  *
20  * In addition, as a special exception, the copyright holders give permission to
21  * link this program with the OpenSSL project's "OpenSSL" library (or with
22  * modified versions of it that use the same license as the "OpenSSL" library),
23  * and distribute the linked executables. You must obey the GNU General Public
24  * License in all respects for all of the code used other than "OpenSSL". If you
25  * modify file(s), you may extend this exception to your version of the file(s),
26  * but you are not obligated to do so. If you do not wish to do so, delete this
27  * exception statement from your version.
28  */
29 
30 #include "rss_parser.h"
31 
32 #include <QDateTime>
33 #include <QDebug>
34 #include <QGlobalStatic>
35 #include <QHash>
36 #include <QMetaObject>
37 #include <QRegularExpression>
38 #include <QStringList>
39 #include <QVariant>
40 #include <QXmlStreamEntityResolver>
41 #include <QXmlStreamReader>
42 
43 #include "rss_article.h"
44 
45 namespace
46 {
47  class XmlStreamEntityResolver final : public QXmlStreamEntityResolver
48  {
49  public:
50  QString resolveUndeclaredEntity(const QString &name) override
51  {
52  // (X)HTML entities declared in:
53  // http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
54  // http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
55  // http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
56  static const QHash<QString, QString> HTMLEntities
57  {
58  {"nbsp", "&#160;"}, // no-break space = non-breaking space, U+00A0 ISOnum
59  {"iexcl", "&#161;"}, // inverted exclamation mark, U+00A1 ISOnum
60  {"cent", "&#162;"}, // cent sign, U+00A2 ISOnum
61  {"pound", "&#163;"}, // pound sign, U+00A3 ISOnum
62  {"curren", "&#164;"}, // currency sign, U+00A4 ISOnum
63  {"yen", "&#165;"}, // yen sign = yuan sign, U+00A5 ISOnum
64  {"brvbar", "&#166;"}, // broken bar = broken vertical bar, U+00A6 ISOnum
65  {"sect", "&#167;"}, // section sign, U+00A7 ISOnum
66  {"uml", "&#168;"}, // diaeresis = spacing diaeresis, U+00A8 ISOdia
67  {"copy", "&#169;"}, // copyright sign, U+00A9 ISOnum
68  {"ordf", "&#170;"}, // feminine ordinal indicator, U+00AA ISOnum
69  {"laquo", "&#171;"}, // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
70  {"not", "&#172;"}, // not sign = angled dash, U+00AC ISOnum
71  {"shy", "&#173;"}, // soft hyphen = discretionary hyphen, U+00AD ISOnum
72  {"reg", "&#174;"}, // registered sign = registered trade mark sign, U+00AE ISOnum
73  {"macr", "&#175;"}, // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
74  {"deg", "&#176;"}, // degree sign, U+00B0 ISOnum
75  {"plusmn", "&#177;"}, // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
76  {"sup2", "&#178;"}, // superscript two = superscript digit two = squared, U+00B2 ISOnum
77  {"sup3", "&#179;"}, // superscript three = superscript digit three = cubed, U+00B3 ISOnum
78  {"acute", "&#180;"}, // acute accent = spacing acute, U+00B4 ISOdia
79  {"micro", "&#181;"}, // micro sign, U+00B5 ISOnum
80  {"para", "&#182;"}, // pilcrow sign = paragraph sign, U+00B6 ISOnum
81  {"middot", "&#183;"}, // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
82  {"cedil", "&#184;"}, // cedilla = spacing cedilla, U+00B8 ISOdia
83  {"sup1", "&#185;"}, // superscript one = superscript digit one, U+00B9 ISOnum
84  {"ordm", "&#186;"}, // masculine ordinal indicator, U+00BA ISOnum
85  {"raquo", "&#187;"}, // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
86  {"frac14", "&#188;"}, // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
87  {"frac12", "&#189;"}, // vulgar fraction one half = fraction one half, U+00BD ISOnum
88  {"frac34", "&#190;"}, // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
89  {"iquest", "&#191;"}, // inverted question mark = turned question mark, U+00BF ISOnum
90  {"Agrave", "&#192;"}, // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
91  {"Aacute", "&#193;"}, // latin capital letter A with acute, U+00C1 ISOlat1
92  {"Acirc", "&#194;"}, // latin capital letter A with circumflex, U+00C2 ISOlat1
93  {"Atilde", "&#195;"}, // latin capital letter A with tilde, U+00C3 ISOlat1
94  {"Auml", "&#196;"}, // latin capital letter A with diaeresis, U+00C4 ISOlat1
95  {"Aring", "&#197;"}, // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
96  {"AElig", "&#198;"}, // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
97  {"Ccedil", "&#199;"}, // latin capital letter C with cedilla, U+00C7 ISOlat1
98  {"Egrave", "&#200;"}, // latin capital letter E with grave, U+00C8 ISOlat1
99  {"Eacute", "&#201;"}, // latin capital letter E with acute, U+00C9 ISOlat1
100  {"Ecirc", "&#202;"}, // latin capital letter E with circumflex, U+00CA ISOlat1
101  {"Euml", "&#203;"}, // latin capital letter E with diaeresis, U+00CB ISOlat1
102  {"Igrave", "&#204;"}, // latin capital letter I with grave, U+00CC ISOlat1
103  {"Iacute", "&#205;"}, // latin capital letter I with acute, U+00CD ISOlat1
104  {"Icirc", "&#206;"}, // latin capital letter I with circumflex, U+00CE ISOlat1
105  {"Iuml", "&#207;"}, // latin capital letter I with diaeresis, U+00CF ISOlat1
106  {"ETH", "&#208;"}, // latin capital letter ETH, U+00D0 ISOlat1
107  {"Ntilde", "&#209;"}, // latin capital letter N with tilde, U+00D1 ISOlat1
108  {"Ograve", "&#210;"}, // latin capital letter O with grave, U+00D2 ISOlat1
109  {"Oacute", "&#211;"}, // latin capital letter O with acute, U+00D3 ISOlat1
110  {"Ocirc", "&#212;"}, // latin capital letter O with circumflex, U+00D4 ISOlat1
111  {"Otilde", "&#213;"}, // latin capital letter O with tilde, U+00D5 ISOlat1
112  {"Ouml", "&#214;"}, // latin capital letter O with diaeresis, U+00D6 ISOlat1
113  {"times", "&#215;"}, // multiplication sign, U+00D7 ISOnum
114  {"Oslash", "&#216;"}, // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
115  {"Ugrave", "&#217;"}, // latin capital letter U with grave, U+00D9 ISOlat1
116  {"Uacute", "&#218;"}, // latin capital letter U with acute, U+00DA ISOlat1
117  {"Ucirc", "&#219;"}, // latin capital letter U with circumflex, U+00DB ISOlat1
118  {"Uuml", "&#220;"}, // latin capital letter U with diaeresis, U+00DC ISOlat1
119  {"Yacute", "&#221;"}, // latin capital letter Y with acute, U+00DD ISOlat1
120  {"THORN", "&#222;"}, // latin capital letter THORN, U+00DE ISOlat1
121  {"szlig", "&#223;"}, // latin small letter sharp s = ess-zed, U+00DF ISOlat1
122  {"agrave", "&#224;"}, // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
123  {"aacute", "&#225;"}, // latin small letter a with acute, U+00E1 ISOlat1
124  {"acirc", "&#226;"}, // latin small letter a with circumflex, U+00E2 ISOlat1
125  {"atilde", "&#227;"}, // latin small letter a with tilde, U+00E3 ISOlat1
126  {"auml", "&#228;"}, // latin small letter a with diaeresis, U+00E4 ISOlat1
127  {"aring", "&#229;"}, // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
128  {"aelig", "&#230;"}, // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
129  {"ccedil", "&#231;"}, // latin small letter c with cedilla, U+00E7 ISOlat1
130  {"egrave", "&#232;"}, // latin small letter e with grave, U+00E8 ISOlat1
131  {"eacute", "&#233;"}, // latin small letter e with acute, U+00E9 ISOlat1
132  {"ecirc", "&#234;"}, // latin small letter e with circumflex, U+00EA ISOlat1
133  {"euml", "&#235;"}, // latin small letter e with diaeresis, U+00EB ISOlat1
134  {"igrave", "&#236;"}, // latin small letter i with grave, U+00EC ISOlat1
135  {"iacute", "&#237;"}, // latin small letter i with acute, U+00ED ISOlat1
136  {"icirc", "&#238;"}, // latin small letter i with circumflex, U+00EE ISOlat1
137  {"iuml", "&#239;"}, // latin small letter i with diaeresis, U+00EF ISOlat1
138  {"eth", "&#240;"}, // latin small letter eth, U+00F0 ISOlat1
139  {"ntilde", "&#241;"}, // latin small letter n with tilde, U+00F1 ISOlat1
140  {"ograve", "&#242;"}, // latin small letter o with grave, U+00F2 ISOlat1
141  {"oacute", "&#243;"}, // latin small letter o with acute, U+00F3 ISOlat1
142  {"ocirc", "&#244;"}, // latin small letter o with circumflex, U+00F4 ISOlat1
143  {"otilde", "&#245;"}, // latin small letter o with tilde, U+00F5 ISOlat1
144  {"ouml", "&#246;"}, // latin small letter o with diaeresis, U+00F6 ISOlat1
145  {"divide", "&#247;"}, // division sign, U+00F7 ISOnum
146  {"oslash", "&#248;"}, // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
147  {"ugrave", "&#249;"}, // latin small letter u with grave, U+00F9 ISOlat1
148  {"uacute", "&#250;"}, // latin small letter u with acute, U+00FA ISOlat1
149  {"ucirc", "&#251;"}, // latin small letter u with circumflex, U+00FB ISOlat1
150  {"uuml", "&#252;"}, // latin small letter u with diaeresis, U+00FC ISOlat1
151  {"yacute", "&#253;"}, // latin small letter y with acute, U+00FD ISOlat1
152  {"thorn", "&#254;"}, // latin small letter thorn, U+00FE ISOlat1
153  {"yuml", "&#255;"}, // latin small letter y with diaeresis, U+00FF ISOlat1
154 
155  // Latin Extended-A
156  {"OElig", "&#338;"}, // latin capital ligature OE, U+0152 ISOlat2
157  {"oelig", "&#339;"}, // latin small ligature oe, U+0153 ISOlat2
158  // ligature is a misnomer, this is a separate character in some languages
159  {"Scaron", "&#352;"}, // latin capital letter S with caron, U+0160 ISOlat2
160  {"scaron", "&#353;"}, // latin small letter s with caron, U+0161 ISOlat2
161  {"Yuml", "&#376;"}, // latin capital letter Y with diaeresis, U+0178 ISOlat2
162 
163  // Spacing Modifier Letters
164  {"circ", "&#710;"}, // modifier letter circumflex accent, U+02C6 ISOpub
165  {"tilde", "&#732;"}, // small tilde, U+02DC ISOdia
166 
167  // General Punctuation
168  {"ensp", "&#8194;"}, // en space, U+2002 ISOpub
169  {"emsp", "&#8195;"}, // em space, U+2003 ISOpub
170  {"thinsp", "&#8201;"}, // thin space, U+2009 ISOpub
171  {"zwnj", "&#8204;"}, // zero width non-joiner, U+200C NEW RFC 2070
172  {"zwj", "&#8205;"}, // zero width joiner, U+200D NEW RFC 2070
173  {"lrm", "&#8206;"}, // left-to-right mark, U+200E NEW RFC 2070
174  {"rlm", "&#8207;"}, // right-to-left mark, U+200F NEW RFC 2070
175  {"ndash", "&#8211;"}, // en dash, U+2013 ISOpub
176  {"mdash", "&#8212;"}, // em dash, U+2014 ISOpub
177  {"lsquo", "&#8216;"}, // left single quotation mark, U+2018 ISOnum
178  {"rsquo", "&#8217;"}, // right single quotation mark, U+2019 ISOnum
179  {"sbquo", "&#8218;"}, // single low-9 quotation mark, U+201A NEW
180  {"ldquo", "&#8220;"}, // left double quotation mark, U+201C ISOnum
181  {"rdquo", "&#8221;"}, // right double quotation mark, U+201D ISOnum
182  {"bdquo", "&#8222;"}, // double low-9 quotation mark, U+201E NEW
183  {"dagger", "&#8224;"}, // dagger, U+2020 ISOpub
184  {"Dagger", "&#8225;"}, // double dagger, U+2021 ISOpub
185  {"permil", "&#8240;"}, // per mille sign, U+2030 ISOtech
186  {"lsaquo", "&#8249;"}, // single left-pointing angle quotation mark, U+2039 ISO proposed
187  // lsaquo is proposed but not yet ISO standardized
188  {"rsaquo", "&#8250;"}, // single right-pointing angle quotation mark, U+203A ISO proposed
189  // rsaquo is proposed but not yet ISO standardized
190 
191  // Currency Symbols
192  {"euro", "&#8364;"}, // euro sign, U+20AC NEW
193 
194  // Latin Extended-B
195  {"fnof", "&#402;"}, // latin small letter f with hook = function = florin, U+0192 ISOtech
196 
197  // Greek
198  {"Alpha", "&#913;"}, // greek capital letter alpha, U+0391
199  {"Beta", "&#914;"}, // greek capital letter beta, U+0392
200  {"Gamma", "&#915;"}, // greek capital letter gamma, U+0393 ISOgrk3
201  {"Delta", "&#916;"}, // greek capital letter delta, U+0394 ISOgrk3
202  {"Epsilon", "&#917;"}, // greek capital letter epsilon, U+0395
203  {"Zeta", "&#918;"}, // greek capital letter zeta, U+0396
204  {"Eta", "&#919;"}, // greek capital letter eta, U+0397
205  {"Theta", "&#920;"}, // greek capital letter theta, U+0398 ISOgrk3
206  {"Iota", "&#921;"}, // greek capital letter iota, U+0399
207  {"Kappa", "&#922;"}, // greek capital letter kappa, U+039A
208  {"Lambda", "&#923;"}, // greek capital letter lamda, U+039B ISOgrk3
209  {"Mu", "&#924;"}, // greek capital letter mu, U+039C
210  {"Nu", "&#925;"}, // greek capital letter nu, U+039D
211  {"Xi", "&#926;"}, // greek capital letter xi, U+039E ISOgrk3
212  {"Omicron", "&#927;"}, // greek capital letter omicron, U+039F
213  {"Pi", "&#928;"}, // greek capital letter pi, U+03A0 ISOgrk3
214  {"Rho", "&#929;"}, // greek capital letter rho, U+03A1
215  {"Sigma", "&#931;"}, // greek capital letter sigma, U+03A3 ISOgrk3
216  {"Tau", "&#932;"}, // greek capital letter tau, U+03A4
217  {"Upsilon", "&#933;"}, // greek capital letter upsilon, U+03A5 ISOgrk3
218  {"Phi", "&#934;"}, // greek capital letter phi, U+03A6 ISOgrk3
219  {"Chi", "&#935;"}, // greek capital letter chi, U+03A7
220  {"Psi", "&#936;"}, // greek capital letter psi, U+03A8 ISOgrk3
221  {"Omega", "&#937;"}, // greek capital letter omega, U+03A9 ISOgrk3
222  {"alpha", "&#945;"}, // greek small letter alpha, U+03B1 ISOgrk3
223  {"beta", "&#946;"}, // greek small letter beta, U+03B2 ISOgrk3
224  {"gamma", "&#947;"}, // greek small letter gamma, U+03B3 ISOgrk3
225  {"delta", "&#948;"}, // greek small letter delta, U+03B4 ISOgrk3
226  {"epsilon", "&#949;"}, // greek small letter epsilon, U+03B5 ISOgrk3
227  {"zeta", "&#950;"}, // greek small letter zeta, U+03B6 ISOgrk3
228  {"eta", "&#951;"}, // greek small letter eta, U+03B7 ISOgrk3
229  {"theta", "&#952;"}, // greek small letter theta, U+03B8 ISOgrk3
230  {"iota", "&#953;"}, // greek small letter iota, U+03B9 ISOgrk3
231  {"kappa", "&#954;"}, // greek small letter kappa, U+03BA ISOgrk3
232  {"lambda", "&#955;"}, // greek small letter lamda, U+03BB ISOgrk3
233  {"mu", "&#956;"}, // greek small letter mu, U+03BC ISOgrk3
234  {"nu", "&#957;"}, // greek small letter nu, U+03BD ISOgrk3
235  {"xi", "&#958;"}, // greek small letter xi, U+03BE ISOgrk3
236  {"omicron", "&#959;"}, // greek small letter omicron, U+03BF NEW
237  {"pi", "&#960;"}, // greek small letter pi, U+03C0 ISOgrk3
238  {"rho", "&#961;"}, // greek small letter rho, U+03C1 ISOgrk3
239  {"sigmaf", "&#962;"}, // greek small letter final sigma, U+03C2 ISOgrk3
240  {"sigma", "&#963;"}, // greek small letter sigma, U+03C3 ISOgrk3
241  {"tau", "&#964;"}, // greek small letter tau, U+03C4 ISOgrk3
242  {"upsilon", "&#965;"}, // greek small letter upsilon, U+03C5 ISOgrk3
243  {"phi", "&#966;"}, // greek small letter phi, U+03C6 ISOgrk3
244  {"chi", "&#967;"}, // greek small letter chi, U+03C7 ISOgrk3
245  {"psi", "&#968;"}, // greek small letter psi, U+03C8 ISOgrk3
246  {"omega", "&#969;"}, // greek small letter omega, U+03C9 ISOgrk3
247  {"thetasym", "&#977;"}, // greek theta symbol, U+03D1 NEW
248  {"upsih", "&#978;"}, // greek upsilon with hook symbol, U+03D2 NEW
249  {"piv", "&#982;"}, // greek pi symbol, U+03D6 ISOgrk3
250 
251  // General Punctuation
252  {"bull", "&#8226;"}, // bullet = black small circle, U+2022 ISOpub
253  // bullet is NOT the same as bullet operator, U+2219
254  {"hellip", "&#8230;"}, // horizontal ellipsis = three dot leader, U+2026 ISOpub
255  {"prime", "&#8242;"}, // prime = minutes = feet, U+2032 ISOtech
256  {"Prime", "&#8243;"}, // double prime = seconds = inches, U+2033 ISOtech
257  {"oline", "&#8254;"}, // overline = spacing overscore, U+203E NEW
258  {"frasl", "&#8260;"}, // fraction slash, U+2044 NEW
259 
260  // Letterlike Symbols
261  {"weierp", "&#8472;"}, // script capital P = power set = Weierstrass p, U+2118 ISOamso
262  {"image", "&#8465;"}, // black-letter capital I = imaginary part, U+2111 ISOamso
263  {"real", "&#8476;"}, // black-letter capital R = real part symbol, U+211C ISOamso
264  {"trade", "&#8482;"}, // trade mark sign, U+2122 ISOnum
265  {"alefsym", "&#8501;"}, // alef symbol = first transfinite cardinal, U+2135 NEW
266  // alef symbol is NOT the same as hebrew letter alef,
267  // U+05D0 although the same glyph could be used to depict both characters
268 
269  // Arrows
270  {"larr", "&#8592;"}, // leftwards arrow, U+2190 ISOnum
271  {"uarr", "&#8593;"}, // upwards arrow, U+2191 ISOnum
272  {"rarr", "&#8594;"}, // rightwards arrow, U+2192 ISOnum
273  {"darr", "&#8595;"}, // downwards arrow, U+2193 ISOnum
274  {"harr", "&#8596;"}, // left right arrow, U+2194 ISOamsa
275  {"crarr", "&#8629;"}, // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
276  {"lArr", "&#8656;"}, // leftwards double arrow, U+21D0 ISOtech
277  // Unicode does not say that lArr is the same as the 'is implied by' arrow
278  // but also does not have any other character for that function. So lArr can
279  // be used for 'is implied by' as ISOtech suggests
280  {"uArr", "&#8657;"}, // upwards double arrow, U+21D1 ISOamsa
281  {"rArr", "&#8658;"}, // rightwards double arrow, U+21D2 ISOtech
282  // Unicode does not say this is the 'implies' character but does not have
283  // another character with this function so rArr can be used for 'implies'
284  // as ISOtech suggests
285  {"dArr", "&#8659;"}, // downwards double arrow, U+21D3 ISOamsa
286  {"hArr", "&#8660;"}, // left right double arrow, U+21D4 ISOamsa
287 
288  // Mathematical Operators
289  {"forall", "&#8704;"}, // for all, U+2200 ISOtech
290  {"part", "&#8706;"}, // partial differential, U+2202 ISOtech
291  {"exist", "&#8707;"}, // there exists, U+2203 ISOtech
292  {"empty", "&#8709;"}, // empty set = null set, U+2205 ISOamso
293  {"nabla", "&#8711;"}, // nabla = backward difference, U+2207 ISOtech
294  {"isin", "&#8712;"}, // element of, U+2208 ISOtech
295  {"notin", "&#8713;"}, // not an element of, U+2209 ISOtech
296  {"ni", "&#8715;"}, // contains as member, U+220B ISOtech
297  {"prod", "&#8719;"}, // n-ary product = product sign, U+220F ISOamsb
298  // prod is NOT the same character as U+03A0 'greek capital letter pi' though
299  // the same glyph might be used for both
300  {"sum", "&#8721;"}, // n-ary summation, U+2211 ISOamsb
301  // sum is NOT the same character as U+03A3 'greek capital letter sigma'
302  // though the same glyph might be used for both
303  {"minus", "&#8722;"}, // minus sign, U+2212 ISOtech
304  {"lowast", "&#8727;"}, // asterisk operator, U+2217 ISOtech
305  {"radic", "&#8730;"}, // square root = radical sign, U+221A ISOtech
306  {"prop", "&#8733;"}, // proportional to, U+221D ISOtech
307  {"infin", "&#8734;"}, // infinity, U+221E ISOtech
308  {"ang", "&#8736;"}, // angle, U+2220 ISOamso
309  {"and", "&#8743;"}, // logical and = wedge, U+2227 ISOtech
310  {"or", "&#8744;"}, // logical or = vee, U+2228 ISOtech
311  {"cap", "&#8745;"}, // intersection = cap, U+2229 ISOtech
312  {"cup", "&#8746;"}, // union = cup, U+222A ISOtech
313  {"int", "&#8747;"}, // integral, U+222B ISOtech
314  {"there4", "&#8756;"}, // therefore, U+2234 ISOtech
315  {"sim", "&#8764;"}, // tilde operator = varies with = similar to, U+223C ISOtech
316  // tilde operator is NOT the same character as the tilde, U+007E,
317  // although the same glyph might be used to represent both
318  {"cong", "&#8773;"}, // approximately equal to, U+2245 ISOtech
319  {"asymp", "&#8776;"}, // almost equal to = asymptotic to, U+2248 ISOamsr
320  {"ne", "&#8800;"}, // not equal to, U+2260 ISOtech
321  {"equiv", "&#8801;"}, // identical to, U+2261 ISOtech
322  {"le", "&#8804;"}, // less-than or equal to, U+2264 ISOtech
323  {"ge", "&#8805;"}, // greater-than or equal to, U+2265 ISOtech
324  {"sub", "&#8834;"}, // subset of, U+2282 ISOtech
325  {"sup", "&#8835;"}, // superset of, U+2283 ISOtech
326  {"nsub", "&#8836;"}, // not a subset of, U+2284 ISOamsn
327  {"sube", "&#8838;"}, // subset of or equal to, U+2286 ISOtech
328  {"supe", "&#8839;"}, // superset of or equal to, U+2287 ISOtech
329  {"oplus", "&#8853;"}, // circled plus = direct sum, U+2295 ISOamsb
330  {"otimes", "&#8855;"}, // circled times = vector product, U+2297 ISOamsb
331  {"perp", "&#8869;"}, // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
332  {"sdot", "&#8901;"}, // dot operator, U+22C5 ISOamsb
333  // dot operator is NOT the same character as U+00B7 middle dot
334 
335  // Miscellaneous Technical
336  {"lceil", "&#8968;"}, // left ceiling = APL upstile, U+2308 ISOamsc
337  {"rceil", "&#8969;"}, // right ceiling, U+2309 ISOamsc
338  {"lfloor", "&#8970;"}, // left floor = APL downstile, U+230A ISOamsc
339  {"rfloor", "&#8971;"}, // right floor, U+230B ISOamsc
340  {"lang", "&#9001;"}, // left-pointing angle bracket = bra, U+2329 ISOtech
341  // lang is NOT the same character as U+003C 'less than sign'
342  // or U+2039 'single left-pointing angle quotation mark'
343  {"rang", "&#9002;"}, // right-pointing angle bracket = ket, U+232A ISOtech
344  // rang is NOT the same character as U+003E 'greater than sign'
345  // or U+203A 'single right-pointing angle quotation mark'
346 
347  // Geometric Shapes
348  {"loz", "&#9674;"}, // lozenge, U+25CA ISOpub
349 
350  // Miscellaneous Symbols
351  {"spades", "&#9824;"}, // black spade suit, U+2660 ISOpub
352  {"clubs", "&#9827;"}, // black club suit = shamrock, U+2663 ISOpub
353  {"hearts", "&#9829;"}, // black heart suit = valentine, U+2665 ISOpub
354  {"diams", "&#9830;"} // black diamond suit, U+2666 ISOpub
355  };
356  return HTMLEntities.value(name);
357  }
358  };
359 
360  // Ported to Qt from KDElibs4
361  QDateTime parseDate(const QString &string)
362  {
363  const char shortDay[][4] =
364  {
365  "Mon", "Tue", "Wed",
366  "Thu", "Fri", "Sat",
367  "Sun"
368  };
369  const char longDay[][10] =
370  {
371  "Monday", "Tuesday", "Wednesday",
372  "Thursday", "Friday", "Saturday",
373  "Sunday"
374  };
375  const char shortMonth[][4] =
376  {
377  "Jan", "Feb", "Mar", "Apr",
378  "May", "Jun", "Jul", "Aug",
379  "Sep", "Oct", "Nov", "Dec"
380  };
381 
382  const QString str = string.trimmed();
383  if (str.isEmpty())
384  return QDateTime::currentDateTime();
385 
386  int nyear = 6; // indexes within string to values
387  int nmonth = 4;
388  int nday = 2;
389  int nwday = 1;
390  int nhour = 7;
391  int nmin = 8;
392  int nsec = 9;
393  // Also accept obsolete form "Weekday, DD-Mon-YY HH:MM:SS ±hhmm"
394  QRegularExpression rx {"^(?:([A-Z][a-z]+),\\s*)?(\\d{1,2})(\\s+|-)([^-\\s]+)(\\s+|-)(\\d{2,4})\\s+(\\d\\d):(\\d\\d)(?::(\\d\\d))?\\s+(\\S+)$"};
395  QRegularExpressionMatch rxMatch;
396  QStringList parts;
397  if (str.indexOf(rx, 0, &rxMatch) == 0)
398  {
399  // Check that if date has '-' separators, both separators are '-'.
400  parts = rxMatch.capturedTexts();
401  const bool h1 = (parts[3] == QLatin1String("-"));
402  const bool h2 = (parts[5] == QLatin1String("-"));
403  if (h1 != h2)
404  return QDateTime::currentDateTime();
405  }
406  else
407  {
408  // Check for the obsolete form "Wdy Mon DD HH:MM:SS YYYY"
409  rx = QRegularExpression {"^([A-Z][a-z]+)\\s+(\\S+)\\s+(\\d\\d)\\s+(\\d\\d):(\\d\\d):(\\d\\d)\\s+(\\d\\d\\d\\d)$"};
410  if (str.indexOf(rx, 0, &rxMatch) != 0)
411  return QDateTime::currentDateTime();
412 
413  nyear = 7;
414  nmonth = 2;
415  nday = 3;
416  nwday = 1;
417  nhour = 4;
418  nmin = 5;
419  nsec = 6;
420  parts = rxMatch.capturedTexts();
421  }
422 
423  bool ok[4];
424  const int day = parts[nday].toInt(&ok[0]);
425  int year = parts[nyear].toInt(&ok[1]);
426  const int hour = parts[nhour].toInt(&ok[2]);
427  const int minute = parts[nmin].toInt(&ok[3]);
428  if (!ok[0] || !ok[1] || !ok[2] || !ok[3])
429  return QDateTime::currentDateTime();
430 
431  int second = 0;
432  if (!parts[nsec].isEmpty())
433  {
434  second = parts[nsec].toInt(&ok[0]);
435  if (!ok[0])
436  return QDateTime::currentDateTime();
437  }
438 
439  const bool leapSecond = (second == 60);
440  if (leapSecond)
441  second = 59; // apparently a leap second - validate below, once time zone is known
442  int month = 0;
443  for ( ; (month < 12) && (parts[nmonth] != shortMonth[month]); ++month);
444  int dayOfWeek = -1;
445  if (!parts[nwday].isEmpty())
446  {
447  // Look up the weekday name
448  while ((++dayOfWeek < 7) && (shortDay[dayOfWeek] != parts[nwday]));
449  if (dayOfWeek >= 7)
450  for (dayOfWeek = 0; (dayOfWeek < 7) && (longDay[dayOfWeek] != parts[nwday]); ++dayOfWeek);
451  }
452 
453  // if (month >= 12 || dayOfWeek >= 7
454  // || (dayOfWeek < 0 && format == RFCDateDay))
455  // return QDateTime;
456  const int i = parts[nyear].size();
457  if (i < 4)
458  {
459  // It's an obsolete year specification with less than 4 digits
460  year += ((i == 2) && (year < 50)) ? 2000 : 1900;
461  }
462 
463  // Parse the UTC offset part
464  int offset = 0; // set default to '-0000'
465  bool negOffset = false;
466  if (parts.count() > 10)
467  {
468  rx = QRegularExpression {"^([+-])(\\d\\d)(\\d\\d)$"};
469  if (parts[10].indexOf(rx, 0, &rxMatch) == 0)
470  {
471  // It's a UTC offset ±hhmm
472  parts = rxMatch.capturedTexts();
473  offset = parts[2].toInt(&ok[0]) * 3600;
474  const int offsetMin = parts[3].toInt(&ok[1]);
475  if (!ok[0] || !ok[1] || offsetMin > 59)
476  return {};
477  offset += offsetMin * 60;
478  negOffset = (parts[1] == QLatin1String("-"));
479  if (negOffset)
480  offset = -offset;
481  }
482  else
483  {
484  // Check for an obsolete time zone name
485  const QByteArray zone = parts[10].toLatin1();
486  if ((zone.length() == 1) && (isalpha(zone[0])) && (toupper(zone[0]) != 'J'))
487  {
488  negOffset = true; // military zone: RFC 2822 treats as '-0000'
489  }
490  else if ((zone != "UT") && (zone != "GMT"))
491  { // treated as '+0000'
492  offset = (zone == "EDT")
493  ? -4 * 3600
494  : ((zone == "EST") || (zone == "CDT"))
495  ? -5 * 3600
496  : ((zone == "CST") || (zone == "MDT"))
497  ? -6 * 3600
498  : ((zone == "MST") || (zone == "PDT"))
499  ? -7 * 3600
500  : (zone == "PST")
501  ? -8 * 3600
502  : 0;
503  if (!offset)
504  {
505  // Check for any other alphabetic time zone
506  bool nonalpha = false;
507  for (int i = 0, end = zone.size(); (i < end) && !nonalpha; ++i)
508  nonalpha = !isalpha(zone[i]);
509  if (nonalpha)
510  return {};
511  // TODO: Attempt to recognize the time zone abbreviation?
512  negOffset = true; // unknown time zone: RFC 2822 treats as '-0000'
513  }
514  }
515  }
516  }
517 
518  const QDate qDate(year, month + 1, day); // convert date, and check for out-of-range
519  if (!qDate.isValid())
520  return QDateTime::currentDateTime();
521 
522  const QTime qTime(hour, minute, second);
523  QDateTime result(qDate, qTime, Qt::UTC);
524  if (offset)
525  result = result.addSecs(-offset);
526  if (!result.isValid())
527  return QDateTime::currentDateTime(); // invalid date/time
528 
529  if (leapSecond)
530  {
531  // Validate a leap second time. Leap seconds are inserted after 23:59:59 UTC.
532  // Convert the time to UTC and check that it is 00:00:00.
533  if ((hour*3600 + minute*60 + 60 - offset + 86400*5) % 86400) // (max abs(offset) is 100 hours)
534  return QDateTime::currentDateTime(); // the time isn't the last second of the day
535  }
536 
537  return result;
538  }
539 }
540 
541 using namespace RSS::Private;
542 
543 const int ParsingResultTypeId = qRegisterMetaType<ParsingResult>();
544 
545 Parser::Parser(const QString lastBuildDate)
546 {
547  m_result.lastBuildDate = lastBuildDate;
548 }
549 
550 void Parser::parse(const QByteArray &feedData)
551 {
552  QMetaObject::invokeMethod(this, [this, feedData]() { parse_impl(feedData); }
553  , Qt::QueuedConnection);
554 }
555 
556 // read and create items from a rss document
557 void Parser::parse_impl(const QByteArray &feedData)
558 {
559  QXmlStreamReader xml(feedData);
560  XmlStreamEntityResolver resolver;
561  xml.setEntityResolver(&resolver);
562  bool foundChannel = false;
563 
564  while (xml.readNextStartElement())
565  {
566  if (xml.name() == QLatin1String("rss"))
567  {
568  // Find channels
569  while (xml.readNextStartElement())
570  {
571  if (xml.name() == QLatin1String("channel"))
572  {
573  parseRSSChannel(xml);
574  foundChannel = true;
575  break;
576  }
577 
578  qDebug() << "Skip rss item: " << xml.name();
579  xml.skipCurrentElement();
580  }
581  break;
582  }
583  if (xml.name() == QLatin1String("feed"))
584  { // Atom feed
585  parseAtomChannel(xml);
586  foundChannel = true;
587  break;
588  }
589 
590  qDebug() << "Skip root item: " << xml.name();
591  xml.skipCurrentElement();
592  }
593 
594  if (!foundChannel)
595  {
596  m_result.error = tr("Invalid RSS feed.");
597  }
598  else if (xml.hasError())
599  {
600  m_result.error = tr("%1 (line: %2, column: %3, offset: %4).")
601  .arg(xml.errorString()).arg(xml.lineNumber())
602  .arg(xml.columnNumber()).arg(xml.characterOffset());
603  }
604 
605  emit finished(m_result);
606  m_result.articles.clear(); // clear articles only
607  m_articleIDs.clear();
608 }
609 
610 void Parser::parseRssArticle(QXmlStreamReader &xml)
611 {
612  QVariantHash article;
613  QString altTorrentUrl;
614 
615  while (!xml.atEnd())
616  {
617  xml.readNext();
618  const QString name(xml.name().toString());
619 
620  if (xml.isEndElement() && (name == QLatin1String("item")))
621  break;
622 
623  if (xml.isStartElement())
624  {
625  if (name == QLatin1String("title"))
626  {
627  article[Article::KeyTitle] = xml.readElementText().trimmed();
628  }
629  else if (name == QLatin1String("enclosure"))
630  {
631  if (xml.attributes().value("type") == QLatin1String("application/x-bittorrent"))
632  article[Article::KeyTorrentURL] = xml.attributes().value(QLatin1String("url")).toString();
633  else if (xml.attributes().value("type").isEmpty())
634  altTorrentUrl = xml.attributes().value(QLatin1String("url")).toString();
635  }
636  else if (name == QLatin1String("link"))
637  {
638  const QString text {xml.readElementText().trimmed()};
639  if (text.startsWith(QLatin1String("magnet:"), Qt::CaseInsensitive))
640  article[Article::KeyTorrentURL] = text; // magnet link instead of a news URL
641  else
642  article[Article::KeyLink] = text;
643  }
644  else if (name == QLatin1String("description"))
645  {
646  article[Article::KeyDescription] = xml.readElementText(QXmlStreamReader::IncludeChildElements);
647  }
648  else if (name == QLatin1String("pubDate"))
649  {
650  article[Article::KeyDate] = parseDate(xml.readElementText().trimmed());
651  }
652  else if (name == QLatin1String("author"))
653  {
654  article[Article::KeyAuthor] = xml.readElementText().trimmed();
655  }
656  else if (name == QLatin1String("guid"))
657  {
658  article[Article::KeyId] = xml.readElementText().trimmed();
659  }
660  else
661  {
662  article[name] = xml.readElementText(QXmlStreamReader::IncludeChildElements);
663  }
664  }
665  }
666 
667  if (article[Article::KeyTorrentURL].toString().isEmpty())
668  article[Article::KeyTorrentURL] = altTorrentUrl;
669 
670  addArticle(article);
671 }
672 
673 void Parser::parseRSSChannel(QXmlStreamReader &xml)
674 {
675  while (!xml.atEnd())
676  {
677  xml.readNext();
678 
679  if (xml.isStartElement())
680  {
681  if (xml.name() == QLatin1String("title"))
682  {
683  m_result.title = xml.readElementText();
684  }
685  else if (xml.name() == QLatin1String("lastBuildDate"))
686  {
687  const QString lastBuildDate = xml.readElementText();
688  if (!lastBuildDate.isEmpty())
689  {
690  if (m_result.lastBuildDate == lastBuildDate)
691  {
692  qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
693  return;
694  }
695  m_result.lastBuildDate = lastBuildDate;
696  }
697  }
698  else if (xml.name() == QLatin1String("item"))
699  {
700  parseRssArticle(xml);
701  }
702  }
703  }
704 }
705 
706 void Parser::parseAtomArticle(QXmlStreamReader &xml)
707 {
708  QVariantHash article;
709  bool doubleContent = false;
710 
711  while (!xml.atEnd())
712  {
713  xml.readNext();
714  const QString name(xml.name().toString());
715 
716  if (xml.isEndElement() && (name == QLatin1String("entry")))
717  break;
718 
719  if (xml.isStartElement())
720  {
721  if (name == QLatin1String("title"))
722  {
723  article[Article::KeyTitle] = xml.readElementText().trimmed();
724  }
725  else if (name == QLatin1String("link"))
726  {
727  const QString link = (xml.attributes().isEmpty()
728  ? xml.readElementText().trimmed()
729  : xml.attributes().value(QLatin1String("href")).toString());
730 
731  if (link.startsWith(QLatin1String("magnet:"), Qt::CaseInsensitive))
732  article[Article::KeyTorrentURL] = link; // magnet link instead of a news URL
733  else
734  // Atom feeds can have relative links, work around this and
735  // take the stress of figuring article full URI from UI
736  // Assemble full URI
737  article[Article::KeyLink] = (m_baseUrl.isEmpty() ? link : m_baseUrl + link);
738 
739  }
740  else if ((name == QLatin1String("summary")) || (name == QLatin1String("content")))
741  {
742  if (doubleContent)
743  { // Duplicate content -> ignore
744  xml.skipCurrentElement();
745  continue;
746  }
747 
748  // Try to also parse broken articles, which don't use html '&' escapes
749  // Actually works great for non-broken content too
750  const QString feedText = xml.readElementText(QXmlStreamReader::IncludeChildElements).trimmed();
751  if (!feedText.isEmpty())
752  {
753  article[Article::KeyDescription] = feedText;
754  doubleContent = true;
755  }
756  }
757  else if (name == QLatin1String("updated"))
758  {
759  // ATOM uses standard compliant date, don't do fancy stuff
760  const QDateTime articleDate = QDateTime::fromString(xml.readElementText().trimmed(), Qt::ISODate);
761  article[Article::KeyDate] = (articleDate.isValid() ? articleDate : QDateTime::currentDateTime());
762  }
763  else if (name == QLatin1String("author"))
764  {
765  while (xml.readNextStartElement())
766  {
767  if (xml.name() == QLatin1String("name"))
768  article[Article::KeyAuthor] = xml.readElementText().trimmed();
769  else
770  xml.skipCurrentElement();
771  }
772  }
773  else if (name == QLatin1String("id"))
774  {
775  article[Article::KeyId] = xml.readElementText().trimmed();
776  }
777  else
778  {
779  article[name] = xml.readElementText(QXmlStreamReader::IncludeChildElements);
780  }
781  }
782  }
783 
784  addArticle(article);
785 }
786 
787 void Parser::parseAtomChannel(QXmlStreamReader &xml)
788 {
789  m_baseUrl = xml.attributes().value("xml:base").toString();
790 
791  while (!xml.atEnd())
792  {
793  xml.readNext();
794 
795  if (xml.isStartElement())
796  {
797  if (xml.name() == QLatin1String("title"))
798  {
799  m_result.title = xml.readElementText();
800  }
801  else if (xml.name() == QLatin1String("updated"))
802  {
803  const QString lastBuildDate = xml.readElementText();
804  if (!lastBuildDate.isEmpty())
805  {
806  if (m_result.lastBuildDate == lastBuildDate)
807  {
808  qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
809  return;
810  }
811  m_result.lastBuildDate = lastBuildDate;
812  }
813  }
814  else if (xml.name() == QLatin1String("entry"))
815  {
816  parseAtomArticle(xml);
817  }
818  }
819  }
820 }
821 
822 void Parser::addArticle(QVariantHash article)
823 {
824  QVariant &torrentURL = article[Article::KeyTorrentURL];
825  if (torrentURL.toString().isEmpty())
826  torrentURL = article.value(Article::KeyLink);
827 
828  // If item does not have an ID, fall back to some other identifier.
829  QVariant &localId = article[Article::KeyId];
830  if (localId.toString().isEmpty())
831  {
832  localId = article.value(Article::KeyTorrentURL);
833  if (localId.toString().isEmpty())
834  {
835  localId = article.value(Article::KeyTitle);
836  if (localId.toString().isEmpty())
837  {
838  // The article could not be uniquely identified
839  // since it has no appropriate data.
840  // Just ignore it.
841  return;
842  }
843  }
844  }
845 
846  if (m_articleIDs.contains(localId.toString()))
847  {
848  // The article could not be uniquely identified
849  // since the Feed has duplicate identifiers.
850  // Just ignore it.
851  return;
852  }
853 
854  m_articleIDs.insert(localId.toString());
855  m_result.articles.prepend(article);
856 }
static const QString KeyAuthor
Definition: rss_article.h:56
static const QString KeyDate
Definition: rss_article.h:54
static const QString KeyDescription
Definition: rss_article.h:57
static const QString KeyLink
Definition: rss_article.h:59
static const QString KeyId
Definition: rss_article.h:53
static const QString KeyTitle
Definition: rss_article.h:55
static const QString KeyTorrentURL
Definition: rss_article.h:58
ParsingResult m_result
Definition: rss_parser.h:72
void parse(const QByteArray &feedData)
Definition: rss_parser.cpp:550
Q_INVOKABLE void parse_impl(const QByteArray &feedData)
Definition: rss_parser.cpp:557
void addArticle(QVariantHash article)
Definition: rss_parser.cpp:822
Parser(QString lastBuildDate)
Definition: rss_parser.cpp:545
void parseRSSChannel(QXmlStreamReader &xml)
Definition: rss_parser.cpp:673
void parseAtomChannel(QXmlStreamReader &xml)
Definition: rss_parser.cpp:787
void parseRssArticle(QXmlStreamReader &xml)
Definition: rss_parser.cpp:610
void finished(const RSS::Private::ParsingResult &result)
void parseAtomArticle(QXmlStreamReader &xml)
Definition: rss_parser.cpp:706
QSet< QString > m_articleIDs
Definition: rss_parser.h:73
QString resolveUndeclaredEntity(const QString &name) override
Definition: rss_parser.cpp:50
QDateTime parseDate(const QString &string)
Definition: rss_parser.cpp:361
QString toString(const lt::socket_type_t socketType)
Definition: session.cpp:183
const int ParsingResultTypeId
Definition: rss_parser.cpp:543
QList< QVariantHash > articles
Definition: rss_parser.h:49