// owntone-server/src/library/rssscanner.c
/*
* Copyright (C) 2020 whatdoineed2d/Ray
* based heavily on filescanner_playlist.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
// For strptime()
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE
#endif
#include <time.h>
#include <event2/buffer.h>
#include "mxml-compat.h"
#include "conffile.h"
#include "logger.h"
#include "db.h"
#include "http.h"
#include "misc.h"
#include "misc_json.h"
#include "library.h"
#include "library/filescanner.h"
// URL prefix that identifies an Apple Podcasts page (must be mapped to the real feed url)
#define APPLE_PODCASTS_SERVER "https://podcasts.apple.com/"
// Base url of the Apple lookup service that is queried with the podcast id
#define APPLE_ITUNES_SERVER "https://itunes.apple.com/"
// Default query_limit for a new RSS playlist, i.e. max number of items saved per feed
#define RSS_LIMIT_DEFAULT 10
// Selects how rss_save() treats feed items that may already be in the library
enum rss_scan_type {
// Items already in the library are just pinged and otherwise left untouched
RSS_SCAN_RESCAN,
// Metadata is read again for all items; existing items keep their file id
RSS_SCAN_META,
};
// Values read from one <item> of the feed by rss_xml_parse_item(). Any member
// may be NULL if the feed lacks the element/attribute. The strings are never
// freed in this file - presumably they are owned by the mxml tree (confirm).
struct rss_item_info {
// Text of <title>
const char *title;
// Text of <pubDate>
const char *pubdate;
// Text of <link>
const char *link;
// "url" attribute of <enclosure>
const char *url;
// "type" attribute of <enclosure>
const char *type;
};
// Delay given to library_callback_schedule() for the periodic refresh: 3600 sec
static struct timeval rss_refresh_interval = { 3600, 0 };

// Forward - rss_scan_all() and rss_add() schedule rss_refresh(), which is
// defined further down since it in turn calls rss_scan_all()
static void
rss_refresh(void *arg);
// RSS spec: https://validator.w3.org/feed/docs/rss2.html
/*
 * Parses the date of an RSS item into a broken-down time.
 *
 * The expected format is RFC822 (https://tools.ietf.org/html/rfc822#section-5),
 * e.g. "Fri, 07 Feb 2020 18:58:00 +0000", where the leading day of week is
 * optional. Whatever follows the seconds (a numeric offset, or GMT/UT/EST/
 * A..I/M..Z) is not interpreted.
 *
 * @param tm    Result, always fully written. Will be the current time (UTC) if
 *              date is NULL or can't be parsed.
 * @param date  Date string from the feed, may be NULL (item without pubDate)
 */
static void
rss_date(struct tm *tm, const char *date)
{
  const char *ptr = NULL;
  time_t t;

  memset(tm, 0, sizeof(struct tm));

  // strptime() must not be called with a NULL string
  if (date)
    {
      ptr = strptime(date, "%a,%n", tm); // Looks for optional day of week
      if (!ptr)
        ptr = date;

      ptr = strptime(ptr, "%d%n%b%n%Y%n%H:%M:%S%n", tm);
    }

  if (!ptr)
    {
      // date is missing or junk, using current time
      time(&t);
      gmtime_r(&t, tm);
    }

  // TODO - adjust the TZ?
}
// Makes a request to Apple based on the Apple Podcast ID in rss_url. The JSON
// response is parsed to find the original feed's url. Example rss_url:
// https://podcasts.apple.com/is/podcast/cgp-grey/id974722423
static char *
apple_rss_feedurl_get(const char *rss_url)
{
struct http_client_ctx ctx;
struct evbuffer *evbuf;
char url[100];
const char *ptr;
2020-03-21 22:20:37 +01:00
unsigned podcast_id;
json_object *jresponse;
json_object *jfeedurl;
char *feedurl;
int ret;
ptr = strrchr(rss_url, '/');
if (!ptr)
{
DPRINTF(E_LOG, L_LIB, "Could not parse Apple Podcast RSS ID from '%s'\n", rss_url);
return NULL;
}
2020-03-21 22:20:37 +01:00
ret = sscanf(ptr, "/id%u", &podcast_id);
if (ret != 1)
{
DPRINTF(E_LOG, L_LIB, "Could not parse Apple Podcast RSS ID from '%s'\n", rss_url);
return NULL;
}
2020-03-21 22:20:37 +01:00
CHECK_NULL(L_LIB, evbuf = evbuffer_new());
snprintf(url, sizeof(url), "%slookup?id=%u", APPLE_ITUNES_SERVER, podcast_id);
memset(&ctx, 0, sizeof(struct http_client_ctx));
ctx.url = url;
ctx.input_body = evbuf;
ret = http_client_request(&ctx);
2020-03-21 22:20:37 +01:00
if (ret < 0 || ctx.response_code != HTTP_OK)
{
evbuffer_free(evbuf);
return NULL;
}
2020-03-21 22:20:37 +01:00
jresponse = jparse_obj_from_evbuffer(evbuf);
evbuffer_free(evbuf);
if (!jresponse)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "Could not parse RSS Apple response, podcast id %u\n", podcast_id);
return NULL;
}
2020-03-21 22:20:37 +01:00
/* expect json resp - get feedUrl
* {
* "resultCount": 1,
* "results": [
* {
* "wrapperType": "track",
* "kind": "podcast",
* ...
* "collectionViewUrl": "https://podcasts.apple.com/us/podcast/cgp-grey/id974722423?uo=4",
* "feedUrl": "http://cgpgrey.libsyn.com/rss",
* ...
* "genres": [
* "Education",
* "Podcasts",
* "News"
* ]
* }
* ]
*}
*/
jfeedurl = JPARSE_SELECT(jresponse, "results", "feedUrl");
if (!jfeedurl || json_object_get_type(jfeedurl) != json_type_string)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "Could not find RSS feedUrl in response from Apple, podcast id %u\n", podcast_id);
jparse_free(jresponse);
return NULL;
}
2020-03-21 22:20:37 +01:00
feedurl = safe_strdup(json_object_get_string(jfeedurl));
DPRINTF(E_DBG, L_LIB, "Mapped Apple podcast URL: '%s' -> '%s'\n", rss_url, feedurl);
jparse_free(jresponse);
return feedurl;
}
2020-03-21 22:20:37 +01:00
static struct playlist_info *
playlist_fetch(bool *is_new, const char *path)
{
2020-03-21 22:20:37 +01:00
struct playlist_info *pli;
int ret;
2020-03-21 22:20:37 +01:00
pli = db_pl_fetch_bypath(path);
if (pli)
{
db_pl_clear_items(pli->id);
*is_new = false;
return pli;
}
2020-03-21 22:20:37 +01:00
CHECK_NULL(L_SCAN, pli = calloc(1, sizeof(struct playlist_info)));
2020-03-21 22:20:37 +01:00
ret = playlist_fill(pli, path);
if (ret < 0)
2020-03-21 22:20:37 +01:00
goto error;
2020-03-21 22:20:37 +01:00
pli->directory_id = DIR_HTTP;
pli->type = PL_RSS;
pli->query_limit = RSS_LIMIT_DEFAULT;
ret = library_playlist_save(pli);
if (ret < 0)
goto error;
pli->id = ret;
*is_new = true;
return pli;
error:
DPRINTF(E_LOG, L_SCAN, "Error adding playlist for RSS feed '%s'\n", path);
free_pli(pli, 0);
return NULL;
}
2020-03-21 22:20:37 +01:00
static mxml_node_t *
rss_xml_get(const char *url)
{
2020-03-21 22:20:37 +01:00
struct http_client_ctx ctx = { 0 };
const char *raw = NULL;
mxml_node_t *xml = NULL;
char *feedurl;
int ret;
2020-03-21 22:20:37 +01:00
// Is it an apple podcast stream?
// ie https://podcasts.apple.com/is/podcast/cgp-grey/id974722423
if (strncmp(url, APPLE_PODCASTS_SERVER, strlen(APPLE_PODCASTS_SERVER)) == 0)
{
2020-03-21 22:20:37 +01:00
feedurl = apple_rss_feedurl_get(url);
if (!feedurl)
return NULL;
}
2020-03-21 22:20:37 +01:00
else
feedurl = strdup(url);
2020-03-21 22:20:37 +01:00
CHECK_NULL(L_LIB, ctx.input_body = evbuffer_new());
ctx.url = feedurl;
ret = http_client_request(&ctx);
if (ret < 0 || ctx.response_code != HTTP_OK)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "Failed to fetch RSS from '%s' (return %d, error code %d)\n", ctx.url, ret, ctx.response_code);
goto cleanup;
}
2020-03-21 22:20:37 +01:00
evbuffer_add(ctx.input_body, "", 1);
2020-03-21 22:20:37 +01:00
raw = (const char*)evbuffer_pullup(ctx.input_body, -1);
xml = mxmlLoadString(NULL, raw, MXML_OPAQUE_CALLBACK);
if (!xml)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "Failed to parse RSS XML from '%s'\n", ctx.url);
goto cleanup;
}
2020-03-21 22:20:37 +01:00
cleanup:
evbuffer_free(ctx.input_body);
free(feedurl);
return xml;
}
2020-03-21 22:20:37 +01:00
/*
 * Reads the feed level metadata from the <channel> element of the RSS.
 *
 * The returned strings are not copies and are not to be freed by the caller
 * (presumably they belong to xml, so don't use them after mxmlDelete - confirm).
 *
 * @param feed_title   Set to the text of the channel's <title>
 * @param feed_author  Set to the text of the channel's <itunes:author>, or to
 *                     NULL if the element is missing
 * @param xml          Root of the parsed RSS
 * @return             0 on success, -1 if <channel> or its <title> is missing
 *                     (the output parameters may then be unset)
 */
static int
rss_xml_parse_feed(const char **feed_title, const char **feed_author, mxml_node_t *xml)
{
  mxml_node_t *channel;
  mxml_node_t *node;

  channel = mxmlFindElement(xml, xml, "channel", NULL, NULL, MXML_DESCEND);
  if (!channel)
    {
      DPRINTF(E_LOG, L_LIB, "Invalid RSS/xml, missing 'channel' node\n");
      return -1;
    }

  // MXML_DESCEND_FIRST, so the <title> of the items won't be matched
  node = mxmlFindElement(channel, channel, "title", NULL, NULL, MXML_DESCEND_FIRST);
  if (!node)
    {
      DPRINTF(E_LOG, L_LIB, "Invalid RSS/xml, missing 'title' node\n");
      return -1;
    }

  *feed_title = mxmlGetOpaque(node);

  node = mxmlFindElement(channel, channel, "itunes:author", NULL, NULL, MXML_DESCEND_FIRST);
  *feed_author = node ? mxmlGetOpaque(node) : NULL;

  return 0;
}
/*
 * Iterates over the <item> elements of the RSS, reading one per call.
 *
 * @param ri       Overwritten with the values of the item. Members will be NULL
 *                 where the item lacks the element/attribute. Untouched if
 *                 there are no more items.
 * @param xml      Root of the parsed RSS
 * @param saveptr  Iterator state, *saveptr must be NULL for the first call. Is
 *                 set to the current item, or NULL when no more items - so
 *                 calling again after the end restarts from the first item.
 * @return         0 if an item was read, -1 if there are no more items
 */
static int
rss_xml_parse_item(struct rss_item_info *ri, mxml_node_t *xml, void **saveptr)
{
  mxml_node_t *item;
  mxml_node_t *node;
  const char *s;

  if (*saveptr)
    {
      // Continue from the previous item, skipping siblings that aren't <item>
      item = (mxml_node_t *)(*saveptr);
      while ( (item = mxmlGetNextSibling(item)) )
        {
          s = mxmlGetElement(item);
          if (s && strcmp(s, "item") == 0)
            break;
        }

      *saveptr = item;
    }
  else
    {
      item = mxmlFindElement(xml, xml, "item", NULL, NULL, MXML_DESCEND);

      *saveptr = item;
    }

  if (!item)
    return -1; // No more items

  memset(ri, 0, sizeof(struct rss_item_info));

  // The lookups below don't check for missing nodes, i.e. the mxml getters may
  // be called with NULL (assumes they then return NULL - TODO confirm for all
  // mxml versions covered by mxml-compat.h)
  node = mxmlFindElement(item, item, "title", NULL, NULL, MXML_DESCEND_FIRST);
  ri->title = mxmlGetOpaque(node);

  node = mxmlFindElement(item, item, "pubDate", NULL, NULL, MXML_DESCEND_FIRST);
  ri->pubdate = mxmlGetOpaque(node);

  node = mxmlFindElement(item, item, "link", NULL, NULL, MXML_DESCEND_FIRST);
  ri->link = mxmlGetOpaque(node);

  node = mxmlFindElement(item, item, "enclosure", NULL, NULL, MXML_DESCEND_FIRST);
  ri->url = mxmlElementGetAttr(node, "url");
  ri->type = mxmlElementGetAttr(node, "type");

  return 0;
}
static void
mfi_metadata_fixup(struct media_file_info *mfi, struct rss_item_info *ri, const char *feed_title, const char *feed_author, uint32_t time_added)
{
struct tm tm;
// Always take the artist and album from the RSS feed and not the stream
free(mfi->artist);
mfi->artist = safe_strdup(feed_author);
free(mfi->album);
mfi->album = safe_strdup(feed_title);
// Some podcasts (Apple) can use mp4 streams which tend not to have decent tags so
// in those cases take info from the RSS and not the stream
2020-03-21 22:20:37 +01:00
if (!mfi->url)
mfi->url = safe_strdup(ri->link);
if (!mfi->genre || strcmp("(186)Podcast", mfi->genre) == 0)
{
2020-03-21 22:20:37 +01:00
free(mfi->genre);
mfi->genre = strdup("Podcast");
}
2020-04-01 21:46:15 +02:00
// The title from the xml is usually better quality
if (ri->title)
{
2020-03-21 22:20:37 +01:00
free(mfi->title);
2020-04-01 21:46:15 +02:00
mfi->title = strdup(ri->title);
}
2020-03-21 22:20:37 +01:00
// Remove, some can be very verbose
free(mfi->comment);
mfi->comment = NULL;
2020-03-21 22:20:37 +01:00
// Date is always from the RSS feed info
rss_date(&tm, ri->pubdate);
mfi->date_released = mktime(&tm);
mfi->year = 1900 + tm.tm_year;
2020-03-21 22:20:37 +01:00
mfi->media_kind = MEDIA_KIND_PODCAST;
2020-03-21 22:20:37 +01:00
mfi->time_added = time_added;
}
static int
2020-03-21 22:20:37 +01:00
rss_save(struct playlist_info *pli, int *count, enum rss_scan_type scan_type)
{
2020-03-21 22:20:37 +01:00
mxml_node_t *xml;
const char *feed_title;
const char *feed_author;
struct media_file_info mfi = { 0 };
struct rss_item_info ri;
uint32_t time_added;
void *ptr = NULL;
int ret;
2020-03-21 22:20:37 +01:00
xml = rss_xml_get(pli->path);
if (!xml)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "Could not get RSS/xml from '%s' (id %d)\n", pli->path, pli->id);
return -1;
}
2020-03-21 22:20:37 +01:00
ret = rss_xml_parse_feed(&feed_title, &feed_author, xml);
if (ret < 0)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "Invalid RSS/xml received from '%s' (id %d)\n", pli->path, pli->id);
mxmlDelete(xml);
return -1;
}
2020-03-21 22:20:37 +01:00
free(pli->title);
pli->title = safe_strdup(feed_title);
2020-03-21 22:20:37 +01:00
// Fake the time - useful when we are adding a new stream - since the
// newest podcasts are added first (the stream is most recent first)
// having time_added date which is older on the most recent episodes
// makes no sense so make all the dates the same for a singleu update
time_added = (uint32_t)time(NULL);
2020-03-21 22:20:37 +01:00
// Walk through the xml, saving each item
*count = 0;
db_transaction_begin();
2020-03-21 22:20:37 +01:00
while ((ret = rss_xml_parse_item(&ri, xml, &ptr)) == 0 && (*count < pli->query_limit))
{
2020-03-21 22:20:37 +01:00
if (library_is_exiting())
{
db_transaction_rollback();
mxmlDelete(xml);
return -1;
}
if (!ri.url)
{
DPRINTF(E_WARN, L_LIB, "Missing URL for item '%s' (date %s) in RSS feed '%s'\n", ri.title, ri.pubdate, feed_title);
continue;
}
2020-03-21 22:20:37 +01:00
db_pl_add_item_bypath(pli->id, ri.url);
(*count)++;
2020-03-21 22:20:37 +01:00
// Try to just ping if already in library
if (scan_type == RSS_SCAN_RESCAN)
{
ret = db_file_ping_bypath(ri.url, 0);
if (ret > 0)
continue;
}
else if (scan_type == RSS_SCAN_META)
{
// Using existing file id if already in library, resulting in update but preserving play_count etc
mfi.id = db_file_id_bypath(ri.url);
if (mfi.id > 0)
time_added = 0;
}
2020-03-21 22:20:37 +01:00
scan_metadata_stream(&mfi, ri.url);
2020-03-21 22:20:37 +01:00
mfi_metadata_fixup(&mfi, &ri, feed_title, feed_author, time_added);
2020-03-21 22:20:37 +01:00
library_media_save(&mfi);
2020-03-21 22:20:37 +01:00
free_mfi(&mfi, 1);
}
2020-03-21 22:20:37 +01:00
db_transaction_end();
mxmlDelete(xml);
2020-03-21 22:20:37 +01:00
return 0;
}
2020-03-21 22:20:37 +01:00
static int
rss_scan(const char *path, enum rss_scan_type scan_type)
{
struct playlist_info *pli;
bool pl_is_new;
int count;
int ret;
2020-03-21 22:20:37 +01:00
// Fetches or creates playlist, clears playlistitems
pli = playlist_fetch(&pl_is_new, path);
if (!pli)
return -1;
2020-03-21 22:20:37 +01:00
// Retrieves the RSS and reads the feed, saving each item as a track, and also
// adds the relationship to playlistitems. The pli will also be updated with
// metadata from the RSS.
ret = rss_save(pli, &count, scan_type);
if (ret < 0)
goto error;
2020-03-21 22:20:37 +01:00
// Save the playlist again, title etc may have been modified by rss_save().
// This also updates the db_timestamp which protects the RSS from deletion.
ret = library_playlist_save(pli);
if (ret < 0)
goto error;
2020-03-21 22:20:37 +01:00
DPRINTF(E_INFO, L_SCAN, "Added or updated %d items from RSS feed '%s' (id %d)\n", count, path, pli->id);
2020-03-21 22:20:37 +01:00
free_pli(pli, 0);
return 0;
error:
2020-03-21 22:20:37 +01:00
if (pl_is_new)
db_pl_delete(pli->id);
free_pli(pli, 0);
return -1;
}
2020-03-21 22:20:37 +01:00
static void
rss_scan_all(enum rss_scan_type scan_type)
{
2020-03-21 22:20:37 +01:00
struct query_params qp = { 0 };
struct db_playlist_info dbpli;
2020-03-21 22:20:37 +01:00
time_t start;
time_t end;
int count;
int ret;
2020-03-21 22:20:37 +01:00
DPRINTF(E_DBG, L_LIB, "Refreshing RSS feeds\n");
2020-03-21 22:20:37 +01:00
start = time(NULL);
2020-03-21 22:20:37 +01:00
qp.type = Q_PL;
qp.sort = S_PLAYLIST;
qp.filter = db_mprintf("(f.type = %d)", PL_RSS);
2020-03-21 22:20:37 +01:00
ret = db_query_start(&qp);
if (ret < 0)
{
DPRINTF(E_LOG, L_LIB, "Failed to find current RSS feeds from db\n");
2020-03-21 22:20:37 +01:00
free(qp.filter);
return;
}
2020-03-21 22:20:37 +01:00
count = 0;
while (((ret = db_query_fetch_pl(&qp, &dbpli)) == 0) && (dbpli.path))
{
2020-03-21 22:20:37 +01:00
ret = rss_scan(dbpli.path, scan_type);
if (ret == 0)
count++;
}
2020-03-21 22:20:37 +01:00
db_query_end(&qp);
free(qp.filter);
2020-03-21 22:20:37 +01:00
end = time(NULL);
if (count == 0)
return;
library_callback_schedule(rss_refresh, NULL, &rss_refresh_interval, LIBRARY_CB_ADD_OR_REPLACE);
2020-03-21 22:20:37 +01:00
DPRINTF(E_INFO, L_LIB, "Refreshed %d RSS feeds in %.f sec (scan type %d)\n", count, difftime(end, start), scan_type);
}
static void
2020-03-21 22:20:37 +01:00
rss_refresh(void *arg)
{
2020-03-21 22:20:37 +01:00
rss_scan_all(RSS_SCAN_RESCAN);
}
static int
2020-03-21 22:20:37 +01:00
rss_rescan(void)
{
2020-03-21 22:20:37 +01:00
rss_scan_all(RSS_SCAN_RESCAN);
2020-03-21 22:20:37 +01:00
return LIBRARY_OK;
}
static int
2020-03-21 22:20:37 +01:00
rss_metascan(void)
{
2020-03-21 22:20:37 +01:00
rss_scan_all(RSS_SCAN_META);
2020-03-21 22:20:37 +01:00
return LIBRARY_OK;
}
static int
2020-03-21 22:20:37 +01:00
rss_fullscan(void)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_LOG, L_LIB, "RSS feeds removed during full-rescan\n");
return LIBRARY_OK;
}
2020-03-21 22:20:37 +01:00
static int
rss_add(const char *path)
{
int ret;
2020-03-21 22:20:37 +01:00
if (strncmp(path, "http://", 7) != 0 && strncmp(path, "https://", 8) != 0)
{
2020-03-21 22:20:37 +01:00
DPRINTF(E_SPAM, L_LIB, "Invalid RSS path '%s'\n", path);
return LIBRARY_PATH_INVALID;
}
2020-03-21 22:20:37 +01:00
DPRINTF(E_DBG, L_LIB, "Adding RSS '%s'\n", path);
2020-03-21 22:20:37 +01:00
ret = rss_scan(path, RSS_SCAN_RESCAN);
if (ret < 0)
return LIBRARY_PATH_INVALID;
library_callback_schedule(rss_refresh, NULL, &rss_refresh_interval, LIBRARY_CB_ADD_OR_REPLACE);
2020-03-21 22:20:37 +01:00
return LIBRARY_OK;
}
struct library_source rssscanner =
{
2020-03-21 22:20:37 +01:00
.name = "RSS feeds",
.disabled = 0,
.initscan = rss_rescan,
2020-03-21 22:20:37 +01:00
.rescan = rss_rescan,
.metarescan = rss_metascan,
.fullrescan = rss_fullscan,
.item_add = rss_add,
};