Feature/rutracker music album rerelease. resolves #11161 (#11275)

This commit is contained in:
XYZJR
2021-03-15 00:51:10 +01:00
committed by GitHub
parent 1e4a407aad
commit 6d79cab496
3 changed files with 302 additions and 107 deletions

View File

@@ -6,6 +6,7 @@ using System.Linq;
using System.Text; using System.Text;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Threading.Tasks; using System.Threading.Tasks;
using AngleSharp.Dom;
using AngleSharp.Html.Parser; using AngleSharp.Html.Parser;
using Jackett.Common.Models; using Jackett.Common.Models;
using Jackett.Common.Models.IndexerConfig; using Jackett.Common.Models.IndexerConfig;
@@ -15,6 +16,7 @@ using Jackett.Common.Utils;
using Jackett.Common.Utils.Clients; using Jackett.Common.Utils.Clients;
using Newtonsoft.Json.Linq; using Newtonsoft.Json.Linq;
using NLog; using NLog;
using NLog.Targets;
namespace Jackett.Common.Indexers namespace Jackett.Common.Indexers
{ {
@@ -1416,14 +1418,49 @@ namespace Jackett.Common.Indexers
// Runs a search for the given query and returns the releases parsed from the result page.
// The search URL comes from CreateSearchUrlForQuery; every row returned by GetReleaseRows is
// handed to ParseReleaseRow, and rows for which that returns null are left out.
// NOTE(review): this text appears to come from a side-by-side diff view, so some lines below
// carry the removed and the added version fused on one line - confirm against the real file.
protected override async Task<IEnumerable<ReleaseInfo>> PerformQuery(TorznabQuery query) protected override async Task<IEnumerable<ReleaseInfo>> PerformQuery(TorznabQuery query)
{ {
var releases = new List<ReleaseInfo>(); var searchUrl = CreateSearchUrlForQuery(query);
var searchString = query.SanitizedSearchTerm;
var results = await RequestWithCookiesAsync(searchUrl);
// The logged-in marker is missing from the page: presumably the session expired.
if (!results.ContentString.Contains("id=\"logged-in-username\""))
{
// Log in again, then repeat the same request once.
await ApplyConfiguration(null);
results = await RequestWithCookiesAsync(searchUrl);
}
var releases = new List<ReleaseInfo>();
try
{
var rows = GetReleaseRows(results);
foreach (var row in rows)
{
var release = ParseReleaseRow(row);
// ParseReleaseRow returns null for rows it skips or fails to parse.
if (release != null)
{
releases.Add(release);
}
}
}
catch (Exception ex)
{
// Any failure here is reported together with the full page content.
OnParseError(results.ContentString, ex);
}
return releases;
}
private string CreateSearchUrlForQuery(in TorznabQuery query)
{
var queryCollection = new NameValueCollection(); var queryCollection = new NameValueCollection();
var searchString = query.SanitizedSearchTerm;
// if the search string is empty use the getnew view // if the search string is empty use the getnew view
if (string.IsNullOrWhiteSpace(searchString)) if (string.IsNullOrWhiteSpace(searchString))
{
queryCollection.Add("nm", searchString); queryCollection.Add("nm", searchString);
}
else // use the normal search else // use the normal search
{ {
searchString = searchString.Replace("-", " "); searchString = searchString.Replace("-", " ");
@@ -1433,43 +1470,41 @@ namespace Jackett.Common.Indexers
} }
var searchUrl = SearchUrl + "?" + queryCollection.GetQueryString(); var searchUrl = SearchUrl + "?" + queryCollection.GetQueryString();
var results = await RequestWithCookiesAsync(searchUrl); return searchUrl;
if (!results.ContentString.Contains("id=\"logged-in-username\""))
{
// re login
await ApplyConfiguration(null);
results = await RequestWithCookiesAsync(searchUrl);
} }
try
// Parses the response content as HTML and returns the elements matching
// "table#tor-tbl > tbody > tr", i.e. the rows of the results table.
// NOTE(review): the lines below appear to carry the removed and the added side of a diff
// fused on one line - confirm against the real file.
private IHtmlCollection<IElement> GetReleaseRows(WebResult results)
{ {
var parser = new HtmlParser(); var parser = new HtmlParser();
var doc = parser.ParseDocument(results.ContentString); var doc = parser.ParseDocument(results.ContentString);
var rows = doc.QuerySelectorAll("table#tor-tbl > tbody > tr"); var rows = doc.QuerySelectorAll("table#tor-tbl > tbody > tr");
foreach (var row in rows) return rows;
}
private ReleaseInfo ParseReleaseRow(IElement row)
{
try try
{ {
var qDownloadLink = row.QuerySelector("td.tor-size > a.tr-dl"); var qDownloadLink = row.QuerySelector("td.tor-size > a.tr-dl");
if (qDownloadLink == null) // Expects moderation if (qDownloadLink == null) // Expects moderation
continue; return null;
var qDetailsLink = row.QuerySelector("td.t-title-col > div.t-title > a.tLink");
var qSize = row.QuerySelector("td.tor-size");
var details = new Uri(SiteLink + "forum/" + qDetailsLink.GetAttribute("href"));
var seeders = 0;
var qSeeders = row.QuerySelector("td:nth-child(7)");
if (qSeeders != null && !qSeeders.TextContent.Contains("дн"))
{
var seedersString = qSeeders.QuerySelector("b").TextContent;
if (!string.IsNullOrWhiteSpace(seedersString))
seeders = ParseUtil.CoerceInt(seedersString);
}
var timestr = row.QuerySelector("td:nth-child(10)").GetAttribute("data-ts_text");
var forum = row.QuerySelector("td.f-name-col > div.f-name > a");
var forumid = forum.GetAttribute("href").Split('=')[1];
var link = new Uri(SiteLink + "forum/" + qDownloadLink.GetAttribute("href")); var link = new Uri(SiteLink + "forum/" + qDownloadLink.GetAttribute("href"));
var size = ReleaseInfo.GetBytes(qSize.GetAttribute("data-ts_text"));
var qDetailsLink = row.QuerySelector("td.t-title-col > div.t-title > a.tLink");
var details = new Uri(SiteLink + "forum/" + qDetailsLink.GetAttribute("href"));
var category = GetCategoryOfRelease(row);
var size = GetSizeOfRelease(row);
var seeders = GetSeedersOfRelease(row);
var leechers = ParseUtil.CoerceInt(row.QuerySelector("td:nth-child(8)").TextContent); var leechers = ParseUtil.CoerceInt(row.QuerySelector("td:nth-child(8)").TextContent);
var grabs = ParseUtil.CoerceLong(row.QuerySelector("td:nth-child(9)").TextContent); var grabs = ParseUtil.CoerceLong(row.QuerySelector("td:nth-child(9)").TextContent);
var publishDate = DateTimeUtil.UnixTimestampToDateTime(long.Parse(timestr));
var publishDate = GetPublishDateOfRelease(row);
var release = new ReleaseInfo var release = new ReleaseInfo
{ {
MinimumRatio = 1, MinimumRatio = 1,
@@ -1483,14 +1518,13 @@ namespace Jackett.Common.Indexers
Peers = leechers + seeders, Peers = leechers + seeders,
Grabs = grabs, Grabs = grabs,
PublishDate = publishDate, PublishDate = publishDate,
Category = MapTrackerCatToNewznab(forumid), Category = category,
DownloadVolumeFactor = 1, DownloadVolumeFactor = 1,
UploadVolumeFactor = 1 UploadVolumeFactor = 1
}; };
// TODO finish extracting release variables to simiplify release initialization // TODO finish extracting release variables to simplify release initialization
if (release.Category.Contains(TorznabCatType.TV.ID) || if (IsAnyTvCategory(release.Category))
TorznabCatType.TV.SubCategories.Any(subCat => release.Category.Contains(subCat.ID)))
{ {
// extract season and episodes // extract season and episodes
var regex = new Regex(".+\\/\\s([^а-яА-я\\/]+)\\s\\/.+Сезон\\s*[:]*\\s+(\\d+).+(?:Серии|Эпизод)+\\s*[:]*\\s+(\\d+-*\\d*).+,\\s+(.+)\\][\\s]?(.*)"); var regex = new Regex(".+\\/\\s([^а-яА-я\\/]+)\\s\\/.+Сезон\\s*[:]*\\s+(\\d+).+(?:Серии|Эпизод)+\\s*[:]*\\s+(\\d+-*\\d*).+,\\s+(.+)\\][\\s]?(.*)");
@@ -1504,9 +1538,7 @@ namespace Jackett.Common.Indexers
release.Title = title; release.Title = title;
} }
else else if (IsAnyMovieCategory(release.Category))
if (release.Category.Contains(TorznabCatType.Movies.ID) ||
TorznabCatType.Movies.SubCategories.Any(subCat => release.Category.Contains(subCat.ID)))
{ {
// remove director's name from title // remove director's name from title
// rutracker movies titles look like: russian name / english name (russian director / english director) other stuff // rutracker movies titles look like: russian name / english name (russian director / english director) other stuff
@@ -1537,19 +1569,64 @@ namespace Jackett.Common.Indexers
release.Title = MoveFirstTagsToEndOfReleaseTitle(release.Title); release.Title = MoveFirstTagsToEndOfReleaseTitle(release.Title);
} }
releases.Add(release); if (release.Category.Contains(TorznabCatType.Audio.ID))
{
release.Title = DetectRereleaseInReleaseTitle(release.Title);
}
return release;
} }
catch (Exception ex) catch (Exception ex)
{ {
logger.Error($"{Id}: Error while parsing row '{row.OuterHtml}':\n\n{ex}"); logger.Error($"{Id}: Error while parsing row '{row.OuterHtml}':\n\n{ex}");
return null;
} }
} }
catch (Exception ex)
{
OnParseError(results.ContentString, ex);
}
/// <summary>
/// Reads the seeder count from the 7th cell of a result row.
/// </summary>
/// <param name="row">The result table row to inspect.</param>
/// <returns>
/// The parsed seeder count, or 0 when the cell is missing, contains "дн",
/// or holds no usable number.
/// </returns>
private int GetSeedersOfRelease(in IElement row)
{
    var qSeeders = row.QuerySelector("td:nth-child(7)");

    // presumably "дн" (Russian abbreviation for days) means the cell shows an age
    // instead of a seeder count - TODO confirm against the site markup
    if (qSeeders == null || qSeeders.TextContent.Contains("дн"))
    {
        return 0;
    }

    // The count is read from a <b> element. A cell without one is treated like an
    // empty count instead of throwing, so the release itself is not lost.
    var seedersString = qSeeders.QuerySelector("b")?.TextContent;
    return string.IsNullOrWhiteSpace(seedersString)
        ? 0
        : ParseUtil.CoerceInt(seedersString);
}
/// <summary>
/// Maps the forum link of a result row to its categories.
/// The forum id is the part of the link's href that follows the first '='.
/// </summary>
/// <param name="row">The result table row to inspect.</param>
/// <returns>The categories returned by MapTrackerCatToNewznab for the forum id.</returns>
private ICollection<int> GetCategoryOfRelease(in IElement row)
{
    var forumHref = row.QuerySelector("td.f-name-col > div.f-name > a").GetAttribute("href");
    var hrefParts = forumHref.Split('=');
    return MapTrackerCatToNewznab(hrefParts[1]);
}
/// <summary>
/// Reads the release size from the "data-ts_text" attribute of the row's size cell
/// and converts it with ReleaseInfo.GetBytes.
/// </summary>
/// <param name="row">The result table row to inspect.</param>
/// <returns>The value produced by ReleaseInfo.GetBytes for the attribute text.</returns>
private long GetSizeOfRelease(in IElement row)
{
    var sizeText = row.QuerySelector("td.tor-size").GetAttribute("data-ts_text");
    return ReleaseInfo.GetBytes(sizeText);
}
/// <summary>
/// Reads the publish date from the "data-ts_text" attribute of the 10th cell of a row.
/// The attribute is parsed as an integer and converted with
/// DateTimeUtil.UnixTimestampToDateTime.
/// </summary>
/// <param name="row">The result table row to inspect.</param>
/// <returns>The publish date of the release.</returns>
private DateTime GetPublishDateOfRelease(in IElement row)
{
    var dateCell = row.QuerySelector("td:nth-child(10)");
    var unixTimestamp = long.Parse(dateCell.GetAttribute("data-ts_text"));
    return DateTimeUtil.UnixTimestampToDateTime(unixTimestamp);
}
/// <summary>
/// Tells whether the given categories include the TV category or any of its sub-categories.
/// </summary>
/// <param name="category">The category ids of a release.</param>
/// <returns>True when the TV id or a TV sub-category id is present.</returns>
private bool IsAnyTvCategory(ICollection<int> category)
{
    // The parent category is checked first, then the sub-categories.
    if (category.Contains(TorznabCatType.TV.ID))
    {
        return true;
    }

    foreach (var subCategory in TorznabCatType.TV.SubCategories)
    {
        if (category.Contains(subCategory.ID))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Tells whether the given categories include the Movies category or any of its sub-categories.
/// </summary>
/// <param name="category">The category ids of a release.</param>
/// <returns>True when the Movies id or a Movies sub-category id is present.</returns>
private bool IsAnyMovieCategory(ICollection<int> category)
{
    // The parent category is checked first, then the sub-categories.
    if (category.Contains(TorznabCatType.Movies.ID))
    {
        return true;
    }

    foreach (var subCategory in TorznabCatType.Movies.SubCategories)
    {
        if (category.Contains(subCategory.ID))
        {
            return true;
        }
    }

    return false;
}
private string MoveAllTagsToEndOfReleaseTitle(string input) private string MoveAllTagsToEndOfReleaseTitle(string input)
@@ -1585,5 +1662,43 @@ namespace Jackett.Common.Indexers
output = output.Trim(); output = output.Trim();
return output; return output;
} }
/// <summary>
/// Looks for a 'year1/year2' pattern in the release title, which would indicate a re-release of an old music album.
/// Text inside square and round brackets is ignored while searching.
/// When the pattern is found it is replaced by the first year and a re-release tag is appended to the title.
/// Not to be confused with discographies; they mostly follow the 'year1-year2' pattern.
/// </summary>
/// <param name="input">The full release title.</param>
/// <returns>The tagged title for a detected re-release, otherwise the unaltered title.</returns>
private string DetectRereleaseInReleaseTitle(string input)
{
    // Strip the [..] tags first, then the (..) tags from what is left.
    var withoutSquareTags = input.RemoveSubstrings(
        input.FindSubstringsBetween('[', ']', includeOpeningAndClosing: true));
    var untaggedTitle = withoutSquareTags.RemoveSubstrings(
        withoutSquareTags.FindSubstringsBetween('(', ')', includeOpeningAndClosing: true));

    // Can only be a re-release if there are at least 2 years in the title.
    if (Regex.Matches(untaggedTitle, @"\d{4}").Count < 2)
    {
        return input;
    }

    var yearPair = Regex.Match(untaggedTitle, @"(\d{4}) *\/ *(\d{4})");
    if (!yearPair.Success)
    {
        // Not in the expected format. Return the unaltered title.
        return input;
    }

    // Keep only the first year of the pair in the full title.
    var originalYear = yearPair.Groups[1].Value;
    return input.Replace(yearPair.Value, originalYear) + "(Re-release)";
}
} }
} }

View File

@@ -7,6 +7,7 @@ using System.Linq;
using System.Security.Cryptography; using System.Security.Cryptography;
using System.Text; using System.Text;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Xml;
using AngleSharp.Dom; using AngleSharp.Dom;
using AngleSharp.Html; using AngleSharp.Html;
using Jackett.Common.Helpers; using Jackett.Common.Helpers;
@@ -185,8 +186,6 @@ namespace Jackett.Common.Utils
return sb.ToString(); return sb.ToString();
} }
public static string GenerateRandom(int length) public static string GenerateRandom(int length)
{ {
var chars = "abcdefghijklmnopqrstuvwxyz0123456789"; var chars = "abcdefghijklmnopqrstuvwxyz0123456789";
@@ -202,5 +201,63 @@ namespace Jackett.Common.Utils
return key; return key;
} }
} }
/// <summary>
/// Lazily yields, in ascending order, every position at which a character occurs in a string.
/// </summary>
/// <param name="source">The string to search.</param>
/// <param name="value">The character to look for.</param>
/// <returns>The zero-based positions of all occurrences; empty when there are none.</returns>
public static IEnumerable<int> AllIndexesOf(this string source, char value)
{
    for (var position = source.IndexOf(value);
         position >= 0;
         position = source.IndexOf(value, position + 1))
    {
        yield return position;
    }
}
/// <summary>
/// Lazily yields, in ascending order, the start positions of all non-overlapping
/// occurrences of a string inside another string. The search is ordinal.
/// </summary>
/// <param name="source">The string to search.</param>
/// <param name="value">The string to look for. An empty value yields no positions.</param>
/// <returns>The zero-based start positions of the occurrences; empty when there are none.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="value"/> is null (raised when the sequence is enumerated).
/// </exception>
public static IEnumerable<int> AllIndexesOf(this string source, string value)
{
    if (value == null)
    {
        throw new System.ArgumentNullException(nameof(value));
    }

    // An empty value is "found" at every start index without ever advancing,
    // which would make this sequence endless.
    if (value.Length == 0)
    {
        yield break;
    }

    // Ordinal search: positions must not depend on the current culture.
    var index = source.IndexOf(value, System.StringComparison.Ordinal);
    while (index != -1)
    {
        yield return index;
        // Continue behind the match, so occurrences never overlap.
        index = source.IndexOf(value, index + value.Length, System.StringComparison.Ordinal);
    }
}
/// <summary>
/// Finds all substrings enclosed by an opening and a closing character.
/// Every closing character is paired with the nearest preceding opening character that has not
/// been paired yet, so for nested pairs both substrings are returned, the inner one first.
/// Closing characters without an available opening character are ignored.
/// </summary>
/// <param name="source">The string to search.</param>
/// <param name="opening">The character that starts a substring.</param>
/// <param name="closing">The character that ends a substring.</param>
/// <param name="includeOpeningAndClosing">Whether the returned substrings keep the two enclosing characters.</param>
/// <returns>The substrings, ordered by the position of their closing character.</returns>
public static IEnumerable<string> FindSubstringsBetween(this string source, char opening, char closing, bool includeOpeningAndClosing)
{
    // Opening positions still waiting for a partner, in ascending order.
    var unpairedOpenings = source.AllIndexesOf(opening).ToList();

    foreach (var closingIndex in source.AllIndexesOf(closing))
    {
        // The list is ascending, so the last entry below the closing position is the nearest one.
        var slot = unpairedOpenings.FindLastIndex(candidate => candidate < closingIndex);
        if (slot < 0)
        {
            continue;
        }

        var openingIndex = unpairedOpenings[slot];
        unpairedOpenings.RemoveAt(slot);

        yield return includeOpeningAndClosing
            ? source.Substring(openingIndex, closingIndex - openingIndex + 1)
            : source.Substring(openingIndex + 1, closingIndex - openingIndex - 1);
    }
}
/// <summary>
/// Removes every occurrence of each given substring from a string.
/// Substrings are removed longest first. Null and empty entries are ignored.
/// </summary>
/// <param name="source">The string to remove from.</param>
/// <param name="substrings">The substrings to remove.</param>
/// <returns>The source string without the given substrings.</returns>
public static string RemoveSubstrings(this string source, IEnumerable<string> substrings)
{
    // string.Replace rejects an empty search value, and a null entry has no length to sort by.
    var removable = substrings
        .Where(candidate => !string.IsNullOrEmpty(candidate))
        .OrderByDescending(candidate => candidate.Length);

    var result = source;
    foreach (var substring in removable)
    {
        result = result.Replace(substring, string.Empty);
    }

    return result;
}
} }
} }

View File

@@ -123,5 +123,28 @@ namespace Jackett.Test.Common.Utils
CollectionAssert.AreEqual(combined, original.ToEnumerable()); CollectionAssert.AreEqual(combined, original.ToEnumerable());
CollectionAssert.AreEqual(duplicateKeys, original.ToEnumerable(true)); CollectionAssert.AreEqual(duplicateKeys, original.ToEnumerable(true));
} }
// Sequential, non-nested entries are returned in order, including their delimiters.
[Test]
public void FindSubstringsBetween_ValidEntries_Succeeds()
{
    var expected = new[] { "<test>", "<abc>", "<def>" };
    var joined = string.Join(string.Empty, expected);

    var actual = joined.FindSubstringsBetween('<', '>', includeOpeningAndClosing: true);

    CollectionAssert.AreEqual(expected, actual);
}
// For nested entries both the inner and the outer substring are returned, the inner one first.
[Test]
public void FindSubstringsBetween_NestedEntries_Succeeds()
{
    var joined = "(test(abc))" + "(def)" + "(ghi)";
    var expected = new[] { "abc", "test(abc)", "def", "ghi" };

    var actual = joined.FindSubstringsBetween('(', ')', includeOpeningAndClosing: false);

    CollectionAssert.AreEqual(expected, actual);
}
} }
} }