core: fix invalid characters in xml/rss. resolves #9118 (#9636)

This commit is contained in:
Diego Heras
2020-09-25 02:40:13 +02:00
committed by GitHub
parent 61eb75f7e1
commit a1108bc5a2
2 changed files with 133 additions and 29 deletions

View File

@@ -2,6 +2,7 @@ using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Globalization; using System.Globalization;
using System.Linq; using System.Linq;
using System.Text.RegularExpressions;
using System.Threading; using System.Threading;
using System.Xml.Linq; using System.Xml.Linq;
@@ -9,10 +10,16 @@ namespace Jackett.Common.Models
{ {
public class ResultPage public class ResultPage
{ {
private static readonly XNamespace atomNs = "http://www.w3.org/2005/Atom"; private static readonly XNamespace _AtomNs = "http://www.w3.org/2005/Atom";
private static readonly XNamespace torznabNs = "http://torznab.com/schemas/2015/feed"; private static readonly XNamespace _TorznabNs = "http://torznab.com/schemas/2015/feed";
public ChannelInfo ChannelInfo { get; private set; } // filters control characters but allows only properly-formed surrogate sequences
// https://stackoverflow.com/a/961504
private static readonly Regex _InvalidXmlChars = new Regex(
@"(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFEFF\uFFFE\uFFFF]",
RegexOptions.Compiled);
private ChannelInfo ChannelInfo { get; }
public IEnumerable<ReleaseInfo> Releases { get; set; } public IEnumerable<ReleaseInfo> Releases { get; set; }
public ResultPage(ChannelInfo channelInfo) public ResultPage(ChannelInfo channelInfo)
@@ -21,15 +28,29 @@ namespace Jackett.Common.Models
Releases = new List<ReleaseInfo>(); Releases = new List<ReleaseInfo>();
} }
private string xmlDateFormat(DateTime dt) /// <summary>
/// removes any unusual unicode characters that can't be encoded into XML (eg 0x1A)
/// </summary>
private static string RemoveInvalidXMLChars(string text)
{
if (text == null)
return null;
return _InvalidXmlChars.Replace(text, "");
}
private static string XmlDateFormat(DateTime dt)
{ {
Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US"); Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US");
//Sat, 14 Mar 2015 17:10:42 -0400 //Sat, 14 Mar 2015 17:10:42 -0400
var f = string.Format(@"{0:ddd, dd MMM yyyy HH:mm:ss }{1}", dt, string.Format("{0:zzz}", dt).Replace(":", "")); return $"{dt:ddd, dd MMM yyyy HH:mm:ss} " + $"{dt:zzz}".Replace(":", "");
return f;
} }
private XElement getTorznabElement(string name, object value) => value == null ? null : new XElement(torznabNs + "attr", new XAttribute("name", name), new XAttribute("value", value)); private static XElement GetTorznabElement(string name, object value)
{
if (value == null)
return null;
return new XElement(_TorznabNs + "attr", new XAttribute("name", name), new XAttribute("value", value));
}
public string ToXml(Uri selfAtom) public string ToXml(Uri selfAtom)
{ {
@@ -39,10 +60,10 @@ namespace Jackett.Common.Models
new XDeclaration("1.0", "UTF-8", null), new XDeclaration("1.0", "UTF-8", null),
new XElement("rss", new XElement("rss",
new XAttribute("version", "1.0"), new XAttribute("version", "1.0"),
new XAttribute(XNamespace.Xmlns + "atom", atomNs.NamespaceName), new XAttribute(XNamespace.Xmlns + "atom", _AtomNs.NamespaceName),
new XAttribute(XNamespace.Xmlns + "torznab", torznabNs.NamespaceName), new XAttribute(XNamespace.Xmlns + "torznab", _TorznabNs.NamespaceName),
new XElement("channel", new XElement("channel",
new XElement(atomNs + "link", new XElement(_AtomNs + "link",
new XAttribute("href", selfAtom.AbsoluteUri), new XAttribute("href", selfAtom.AbsoluteUri),
new XAttribute("rel", "self"), new XAttribute("rel", "self"),
new XAttribute("type", "application/rss+xml") new XAttribute("type", "application/rss+xml")
@@ -60,15 +81,15 @@ namespace Jackett.Common.Models
), ),
from r in Releases from r in Releases
select new XElement("item", select new XElement("item",
new XElement("title", r.Title), new XElement("title", RemoveInvalidXMLChars(r.Title)),
new XElement("guid", r.Guid.AbsoluteUri), // GUID and (Link or Magnet) are mandatory new XElement("guid", r.Guid.AbsoluteUri), // GUID and (Link or Magnet) are mandatory
new XElement("jackettindexer", new XAttribute("id", r.Origin.Id), r.Origin.DisplayName), new XElement("jackettindexer", new XAttribute("id", r.Origin.Id), r.Origin.DisplayName),
r.Comments == null ? null : new XElement("comments", r.Comments.AbsoluteUri), r.Comments == null ? null : new XElement("comments", r.Comments.AbsoluteUri),
r.PublishDate == DateTime.MinValue ? new XElement("pubDate", xmlDateFormat(DateTime.Now)) : new XElement("pubDate", xmlDateFormat(r.PublishDate)), r.PublishDate == DateTime.MinValue ? new XElement("pubDate", XmlDateFormat(DateTime.Now)) : new XElement("pubDate", XmlDateFormat(r.PublishDate)),
r.Size == null ? null : new XElement("size", r.Size), r.Size == null ? null : new XElement("size", r.Size),
r.Files == null ? null : new XElement("files", r.Files), r.Files == null ? null : new XElement("files", r.Files),
r.Grabs == null ? null : new XElement("grabs", r.Grabs), r.Grabs == null ? null : new XElement("grabs", r.Grabs),
new XElement("description", r.Description), new XElement("description", RemoveInvalidXMLChars(r.Description)),
new XElement("link", r.Link?.AbsoluteUri ?? r.MagnetUri.AbsoluteUri), new XElement("link", r.Link?.AbsoluteUri ?? r.MagnetUri.AbsoluteUri),
r.Category == null ? null : from c in r.Category select new XElement("category", c), r.Category == null ? null : from c in r.Category select new XElement("category", c),
new XElement( new XElement(
@@ -77,27 +98,27 @@ namespace Jackett.Common.Models
r.Size == null ? null : new XAttribute("length", r.Size), r.Size == null ? null : new XAttribute("length", r.Size),
new XAttribute("type", "application/x-bittorrent") new XAttribute("type", "application/x-bittorrent")
), ),
r.Category == null ? null : from c in r.Category select getTorznabElement("category", c), r.Category == null ? null : from c in r.Category select GetTorznabElement("category", c),
getTorznabElement("magneturl", r.MagnetUri?.AbsoluteUri), GetTorznabElement("magneturl", r.MagnetUri?.AbsoluteUri),
getTorznabElement("rageid", r.RageID), GetTorznabElement("rageid", r.RageID),
getTorznabElement("thetvdb", r.TVDBId), GetTorznabElement("thetvdb", r.TVDBId),
getTorznabElement("imdb", r.Imdb == null ? null : ((long)r.Imdb).ToString("D7")), GetTorznabElement("imdb", r.Imdb?.ToString("D7")),
getTorznabElement("tmdb", r.TMDb), GetTorznabElement("tmdb", r.TMDb),
getTorznabElement("author", r.Author), GetTorznabElement("author", RemoveInvalidXMLChars(r.Author)),
getTorznabElement("booktitle", r.BookTitle), GetTorznabElement("booktitle", RemoveInvalidXMLChars(r.BookTitle)),
getTorznabElement("seeders", r.Seeders), GetTorznabElement("seeders", r.Seeders),
getTorznabElement("peers", r.Peers), GetTorznabElement("peers", r.Peers),
getTorznabElement("infohash", r.InfoHash), GetTorznabElement("infohash", RemoveInvalidXMLChars(r.InfoHash)),
getTorznabElement("minimumratio", r.MinimumRatio), GetTorznabElement("minimumratio", r.MinimumRatio),
getTorznabElement("minimumseedtime", r.MinimumSeedTime), GetTorznabElement("minimumseedtime", r.MinimumSeedTime),
getTorznabElement("downloadvolumefactor", r.DownloadVolumeFactor), GetTorznabElement("downloadvolumefactor", r.DownloadVolumeFactor),
getTorznabElement("uploadvolumefactor", r.UploadVolumeFactor) GetTorznabElement("uploadvolumefactor", r.UploadVolumeFactor)
) )
) )
) )
); );
return xdoc.Declaration.ToString() + Environment.NewLine + xdoc.ToString(); return xdoc.Declaration + Environment.NewLine + xdoc;
} }
} }
} }

View File

@@ -0,0 +1,83 @@
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Jackett.Common.Indexers;
using Jackett.Common.Models;
using Newtonsoft.Json.Linq;
using NUnit.Framework;
using Assert = NUnit.Framework.Assert;
namespace Jackett.Test.Models
{
/// <summary>
/// Minimal <see cref="BaseIndexer"/> stub used as the <c>Origin</c> of test releases.
/// It passes fixed id/name/description/link values and null services to the base
/// constructor; its configuration and query methods are not implemented.
/// </summary>
internal class TestIndexer : BaseIndexer
{
    /// <summary>
    /// Creates the stub with hard-coded identification values and no services.
    /// </summary>
    public TestIndexer()
        : base(
            id: "test_id",
            name: "test_name",
            description: "test_description",
            link: "https://test.link/",
            configService: null,
            logger: null,
            configData: null,
            p: null)
    {
    }

    /// <summary>Capabilities of the stub; never assigned by this class.</summary>
    public override TorznabCapabilities TorznabCaps { get; protected set; }

    /// <summary>Not supported by the stub.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public override Task<IndexerConfigurationStatus> ApplyConfiguration(JToken configJson)
    {
        throw new NotImplementedException();
    }

    /// <summary>Not supported by the stub.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    protected override Task<IEnumerable<ReleaseInfo>> PerformQuery(TorznabQuery query)
    {
        throw new NotImplementedException();
    }
}
/// <summary>
/// Tests for the XML document produced by <see cref="ResultPage"/>.
/// </summary>
[TestFixture]
public class ResultPageTests
{
    /// <summary>
    /// Builds a result page whose text fields contain the control character 0x1A and
    /// checks that the serialized XML contains the cleaned text and the URL-encoded links
    /// the expected number of times, plus the formatted publish date.
    /// </summary>
    [Test]
    public void TestXmlWithInvalidCharacters()
    {
        // 0x1A can't be represented in XML => https://stackoverflow.com/a/8506173
        // The text mixes ASCII, other Unicode characters and the invalid 0x1A.
        const string text = "Title Ñ 理\u001a\u2813";
        const string validText = "Title Ñ 理\u2813";

        // Link with characters that require URL encoding.
        var link = new Uri("https://example.com/" + text);
        const string validLink = "https://example.com/Title%20%C3%91%20%E7%90%86%1A%E2%A0%93";

        var resultPage = new ResultPage(
            new ChannelInfo // characters in channel info are safe because they are provided by us
            {
                Link = link,
                ImageUrl = link,
                ImageLink = link
            })
        {
            Releases = new List<ReleaseInfo>
            {
                new ReleaseInfo // these fields come from websites and can be problematic
                {
                    Title = text,
                    Guid = link,
                    Link = link,
                    Comments = link,
                    PublishDate = new DateTime(2020, 09, 22),
                    Description = text,
                    Author = text,
                    BookTitle = text,
                    BannerUrl = link,
                    InfoHash = text,
                    MagnetUri = link,
                    Origin = new TestIndexer()
                }
            }
        };

        var xml = resultPage.ToXml(link);

        // The expected strings are literals, not patterns: escape them so that characters
        // such as '.' in the host name are matched exactly instead of acting as wildcards.
        var validTextCount = Regex.Matches(xml, Regex.Escape(validText)).Count;
        var validLinkCount = Regex.Matches(xml, Regex.Escape(validLink)).Count;

        Assert.AreEqual(5, validTextCount, "unexpected number of cleaned text occurrences");
        Assert.AreEqual(9, validLinkCount, "unexpected number of encoded link occurrences");

        // this should be in another test but it's here to avoid creating the whole object again
        Assert.True(xml.Contains("Tue, 22 Sep 2020 00:00:00 "), "formatted publish date not found");
    }
}
}