[scan] RSS support: new library source - RSS scanner; periodically updates the RSS feeds found in the db once subscribed. Automatically translates Apple Podcasts links to the RSS feed link

This commit is contained in:
whatdoineed2do/Ray 2020-03-08 20:07:44 +00:00 committed by ejurgensen
parent 8b02ea9d6b
commit d40131d38d
2 changed files with 751 additions and 0 deletions

View File

@ -103,6 +103,7 @@ forked_daapd_SOURCES = main.c \
library/filescanner.c library/filescanner.h \
library/filescanner_ffmpeg.c library/filescanner_playlist.c \
library/filescanner_smartpl.c $(ITUNES_SRC) \
library/rssscanner.c \
library.c library.h \
$(MDNS_SRC) mdns.h \
remote_pairing.c remote_pairing.h \

750
src/library/rssscanner.c Normal file
View File

@ -0,0 +1,750 @@
/*
* Copyright (C) 2020 whatdoineed2d/Ray
* based heavily on filescanner_playlist.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE
#endif
#include <time.h>
#include <event2/event.h>
#include <mxml.h>
#include "mxml-compat.h"
#include "conffile.h"
#include "logger.h"
#include "db.h"
#include "http.h"
#include "misc.h"
#include "misc_json.h"
#include "library.h"
#include "library/filescanner.h"
// Timer event driving the periodic refresh; obtained from
// library_register_event() in init() and released in deinit()
static struct event *rssev;
// Delay used when (re)arming rssev: 3600 seconds, 0 microseconds
static struct timeval rss_refresh_interval = { 3600, 0 };
// Set while a refresh, rescan or meta rescan is running; rss_rescan() and
// rss_metarescan() return without doing anything when it is already set
static bool scanning;
// RSS spec: https://validator.w3.org/feed/docs/rss2.html

/*
 * Parses the date of an RSS item (RFC822 date-time) into broken-down time.
 *
 * The leading day name and the seconds are both optional. Any trailing zone
 * designator is ignored, so the fields in tm are exactly the numbers found in
 * the string.
 *
 * @param tm    result; always fully initialised on return
 * @param date  date string, may be NULL (e.g. item without pubDate)
 * @return true if date was parsed, false if date was NULL or junk, in which
 *         case tm holds the current time (UTC)
 */
static bool
rss_date(struct tm *tm, const char *date)
{
  // RFC822 https://tools.ietf.org/html/rfc822#section-5
  //   ie Fri, 07 Feb 2020 18:58:00 +0000
  //      ^^^^                  ^^^ ^^^^^
  //      optional         optional could also be GMT/UT/EST/A..I/M..Z
  static const char *const formats[] =
    {
      "%d%n%b%n%Y%n%H:%M:%S%n",
      "%d%n%b%n%Y%n%H:%M%n", // seconds left out
    };
  struct tm base;
  struct tm parsed;
  const char *rest;
  time_t now;
  size_t i;

  memset(tm, 0, sizeof(struct tm));

  if (date)
    {
      memset(&base, 0, sizeof(base));

      // Day name is optional, so a failure here just means "start at the day"
      rest = strptime(date, "%a,%n", &base);
      if (!rest)
	rest = date;

      for (i = 0; i < sizeof(formats) / sizeof(formats[0]); i++)
	{
	  // Each attempt starts from the same state, a failed attempt may
	  // have filled in some of the fields
	  parsed = base;
	  if (strptime(rest, formats[i], &parsed))
	    {
	      *tm = parsed;
	      return true;
	    }
	}
    }

  // date is missing or junk, using current time
  time(&now);
  gmtime_r(&now, tm);

  // TODO - adjust the TZ?
  return false;
}
/*
 * Translates an Apple Podcasts URL into the URL of the podcast's RSS feed.
 *
 * The numeric podcast id is read from the last path component of rss_url
 * ("/id<number>") and looked up via https://itunes.apple.com/lookup. The
 * "feedUrl" of the first element of the "results" array in the JSON reply is
 * the feed URL.
 *
 * @param rss_url  ie https://podcasts.apple.com/is/podcast/cgp-grey/id974722423
 * @return newly allocated feed URL which the caller must free(), or NULL if
 *         the id could not be parsed, the lookup failed, the reply had no
 *         feedUrl or memory ran out
 */
static char*
process_apple_rss(const char *rss_url)
{
  struct http_client_ctx ctx;
  struct evbuffer *evbuf;
  char url[100];
  char *buf = NULL;
  unsigned podid; // apple podcast id
  json_object *json;
  json_object *jsonra = NULL;
  const char *feedURL;
  const char *ptr;
  int ret;

  // ask for the json to get feedUrl
  // https://itunes.apple.com/lookup?id=974722423
  ptr = strrchr(rss_url, '/');
  if (!ptr || sscanf(ptr, "/id%u", &podid) != 1)
    {
      DPRINTF(E_LOG, L_LIB, "Could not parse Apple Podcast RSS ID from '%s'\n", rss_url);
      return NULL;
    }

  evbuf = evbuffer_new();
  if (!evbuf)
    return NULL;

  snprintf(url, sizeof(url), "https://itunes.apple.com/lookup?id=%u", podid);

  memset(&ctx, 0, sizeof(struct http_client_ctx));
  ctx.url = url;
  ctx.input_body = evbuf;

  // A completed request is only useful if the server actually said 200
  ret = http_client_request(&ctx);
  if (ret < 0 || ctx.response_code != HTTP_OK)
    goto out;

  json = jparse_obj_from_evbuffer(evbuf);
  if (!json)
    {
      DPRINTF(E_LOG, L_LIB, "Could not parse RSS apple response, podcast id %u\n", podid);
      goto out;
    }

  /* expect json resp - get feedUrl
   * {
   *  "resultCount": 1,
   *  "results": [
   *    {
   *      "wrapperType": "track",
   *      "kind": "podcast",
   *      ...
   *      "collectionViewUrl": "https://podcasts.apple.com/us/podcast/cgp-grey/id974722423?uo=4",
   *      "feedUrl": "http://cgpgrey.libsyn.com/rss",
   *      ...
   *      "genres": [
   *        "Education",
   *        "Podcasts",
   *        "News"
   *      ]
   *    }
   *  ]
   * }
   */
  if (json_object_object_get_ex(json, "results", &jsonra) && (feedURL = jparse_str_from_array(jsonra, 0, "feedUrl")))
    {
      // feedURL points into json, so the caller gets its own copy
      buf = strdup(feedURL);
      if (buf)
	DPRINTF(E_DBG, L_LIB, "mapped apple podcast URL: %s -> %s\n", rss_url, buf);
      else
	DPRINTF(E_LOG, L_LIB, "Out of memory for apple podcast feed URL, podcast id %u\n", podid);
    }
  else
    DPRINTF(E_DBG, L_LIB, "Could not parse feedURL from RSS apple, podcast id %u\n", podid);

  jparse_free(json);

 out:
  evbuffer_free(evbuf);
  return buf;
}
#ifdef RSS_DEBUG
/*
 * Debug helper (only built with RSS_DEBUG): logs id, title and path of every
 * row that a Q_PLITEMS query returns for playlist plid.
 */
static void
rss_playlist_items(int plid)
{
  struct db_media_file_info row;
  struct query_params params;
  int rc;

  memset(&params, 0, sizeof(params));
  params.type = Q_PLITEMS;
  params.idx_type = I_NONE;
  params.id = plid;

  rc = db_query_start(&params);
  if (rc >= 0)
    {
      // Stop on a fetch error or when a row without id is returned
      for (;;)
	{
	  rc = db_query_fetch_file(&params, &row);
	  if (rc != 0 || !row.id)
	    break;

	  DPRINTF(E_LOG, L_LIB, "plid=%u { id=%s title=%s path=%s }\n", plid, row.id, row.title, row.path);
	}
    }

  db_query_end(&params);
}
#endif
/*
 * Fills mfi for one RSS item: metadata is first read from the media stream at
 * rss_item_url, then completed/overridden with what the feed says.
 *
 * @param mfi              result, zeroed here before use; the caller frees
 *                         its content also when -1 is returned
 * @param pl_id            id of the RSS playlist, only used for logging
 * @param rss_item_url     enclosure url of the item (the media)
 * @param rss_item_type    enclosure type of the item, may be NULL
 * @param rss_feed_author  feed author, fallback for artist
 * @param rss_feed_title   feed title, fallback for album
 * @param rss_item_title   item title, used as title for video items
 * @param rss_item_link    item link, fallback for url
 * @param rss_item_pubDate item date, source of date_released and year
 * @param mtime            value stored as time_added
 * @return 0 on success, -1 if the stream gave neither a length nor a size
 */
static int
map_rss_item_to_mfi(struct media_file_info *mfi, int pl_id, const char *rss_item_url, const char *rss_item_type, const char *rss_feed_author, const char *rss_feed_title, const char *rss_item_title, const char *rss_item_link, const char *rss_item_pubDate, time_t mtime)
{
  struct tm released;
  bool is_video;

  memset(mfi, 0, sizeof(*mfi));
  scan_metadata_stream(mfi, rss_item_url);

  if (mfi->song_length == 0 && mfi->file_size == 0)
    {
      DPRINTF(E_INFO, L_LIB, "Ignoring item (empty media) RSS id: %d name: '%s' url: %s pubdate: %s title: '%s'\n", pl_id, rss_feed_title, rss_item_url, rss_item_pubDate, rss_item_title);
      return -1;
    }

  // Tags from the media win when present. Some podcasts (apple) use mp4
  // streams that tend to lack decent tags, then the RSS info fills the gaps
  if (mfi->artist == NULL)
    mfi->artist = safe_strdup(rss_feed_author);

  if (mfi->album == NULL)
    mfi->album = safe_strdup(rss_feed_title);

  if (mfi->url == NULL)
    mfi->url = safe_strdup(rss_item_link);

  if (mfi->genre == NULL || strcmp(mfi->genre, "(186)Podcast") == 0)
    {
      free(mfi->genre);
      mfi->genre = strdup("Podcast");
    }

  // The title of most mp4 is not valid (it becomes the url obj), so the RSS
  // title replaces it for video items
  is_video = (rss_item_type != NULL) && (strncmp(rss_item_type, "video", 5) == 0);
  if (is_video)
    {
      free(mfi->title);
      mfi->title = safe_strdup(rss_item_title);
    }

  // Comments can be very verbose and are not shown for podcasts - dropped
  free(mfi->comment);
  mfi->comment = NULL;

  // The release date always comes from the RSS feed
  rss_date(&released, rss_item_pubDate);
  mfi->date_released = mktime(&released);
  mfi->year = 1900 + released.tm_year;

  mfi->media_kind = MEDIA_KIND_PODCAST;

  // time_added is faked: a feed lists its newest episodes first, so with real
  // timestamps the newest episodes of a new feed would look like the oldest
  // additions. All items of a single update share the same time instead.
  mfi->time_added = mtime;

  mfi->id = db_file_id_bypath(rss_item_url);

  return 0;
}
// Only add required number of feeds items when limit > 0
int
rss_scan_feed(int pl_id, const char *url, long limit, unsigned *nadded)
{
struct media_file_info mfi;
char *vpath = NULL;
int feed_file_id;
unsigned vpathlen = 0;
unsigned len = 0;
time_t mtime;
char *apple_url = NULL;
const char *rss_xml = NULL;
mxml_node_t *tree = NULL;
mxml_node_t *channel;
mxml_node_t *node;
mxml_node_t *item;
const char *rss_feed_title = NULL;
const char *rss_feed_author = NULL;
const char *rss_item_title = NULL;
const char *rss_item_pubDate = NULL;
const char *rss_item_url = NULL;
const char *rss_item_link = NULL;
const char *rss_item_type = NULL;
struct http_client_ctx ctx;
struct evbuffer *evbuf;
int ret = -1;
DPRINTF(E_DBG, L_LIB, "Refreshing RSS id: %u url: %s limit: %ld\n", pl_id, url, limit);
db_pl_ping(pl_id);
db_pl_ping_items_bymatch("http://", pl_id);
db_pl_ping_items_bymatch("https://", pl_id);
evbuf = evbuffer_new();
if (!evbuf)
goto cleanup;
// Is it an apple podcast stream?
// ie https://podcasts.apple.com/is/podcast/cgp-grey/id974722423
if (strncmp(url, "https://podcasts.apple.com/", 27) == 0)
apple_url = process_apple_rss(url);
memset(&ctx, 0, sizeof(struct http_client_ctx));
ctx.url = apple_url ? apple_url : url;
ctx.input_body = evbuf;
ret = http_client_request(&ctx);
if (ret < 0 || (ret && ctx.response_code != HTTP_OK))
{
DPRINTF(E_WARN, L_LIB, "Failed to fetch RSS id: %u url: %s resp: %d\n", pl_id, url, ctx.response_code);
ret = -1;
goto cleanup;
}
ret = -1;
evbuffer_add(ctx.input_body, "", 1);
rss_xml = (const char*)evbuffer_pullup(ctx.input_body, -1);
if (!rss_xml || strlen(rss_xml) == 0)
{
DPRINTF(E_WARN, L_LIB, "Failed to fetch valid RSS/xml data RSS id: %u url: %sn", pl_id, url);
ret = LIBRARY_PATH_INVALID;
goto cleanup;
}
tree = mxmlLoadString(NULL, rss_xml, MXML_OPAQUE_CALLBACK);
channel = mxmlFindElement(tree, tree, "channel", NULL, NULL, MXML_DESCEND);
if (channel == NULL)
{
DPRINTF(E_WARN, L_LIB, "Invalid RSS/xml, missing 'channel' node - RSS id: %u url: %s\n", pl_id, url);
DPRINTF(E_DBG, L_LIB, "RSS xml len: %ld xml: { %s }\n", strlen(rss_xml), rss_xml);
ret = LIBRARY_PATH_INVALID;
goto cleanup;
}
node = mxmlFindElement(channel, channel, "title", NULL, NULL, MXML_DESCEND);
if (!node)
{
DPRINTF(E_WARN, L_LIB, "Invalid RSS/xml, missing 'title' - RSS id: %u url: %s\n", pl_id, url);
ret = LIBRARY_PATH_INVALID;
goto cleanup;
}
rss_feed_title = mxmlGetOpaque(node);
node = mxmlFindElement(channel, channel, "itunes:author", NULL, NULL, MXML_DESCEND);
if (node)
rss_feed_author = mxmlGetOpaque(node);
time(&mtime);
ret = 0;
memset(&mfi, 0, sizeof(struct media_file_info));
for (node = mxmlFindElement(channel, channel, "item", NULL, NULL, MXML_DESCEND);
node != NULL;
node = mxmlFindElement(node, channel, "item", NULL, NULL, MXML_DESCEND))
{
if (library_is_exiting())
{
DPRINTF(E_WARN, L_LIB, "Abandoning RSS feed refresh due to library exit, will need to rollback pl: %d url: %s\n", pl_id, url);
ret = LIBRARY_ERROR;
break;
}
item = mxmlFindElement(node, node, "title", NULL, NULL, MXML_DESCEND);
rss_item_title = mxmlGetOpaque(item);
item = mxmlFindElement(node, node, "pubDate", NULL, NULL, MXML_DESCEND);
rss_item_pubDate = mxmlGetOpaque(item);
item = mxmlFindElement(node, node, "link", NULL, NULL, MXML_DESCEND);
rss_item_link = mxmlGetOpaque(item);
item = mxmlFindElement(node, node, "enclosure", NULL, NULL, MXML_DESCEND);
rss_item_url = mxmlElementGetAttr(item, "url");
rss_item_type = mxmlElementGetAttr(item, "type");
DPRINTF(E_DBG, L_LIB, "Feed provides RSS id: %d name: '%s' pubDate: %s url: %s title: '%s'\n", pl_id, rss_feed_title, rss_item_pubDate, rss_item_url, rss_item_title);
if (!rss_item_url)
continue;
len = strlen(rss_item_url)+2;
if (len > vpathlen)
{
vpathlen = len;
free(vpath);
vpath = malloc(len);
}
sprintf(vpath, "/%s", rss_item_url);
// check if this item is already in the db - if so, we can stop since the RSS is given to us as LIFO stream
if ((feed_file_id = db_file_id_by_virtualpath_match(vpath)) > 0)
{
DPRINTF(E_DBG, L_LIB, "Most recent DB RSS id: %d name: '%s' url: %s file_id: %d pubdate: %s title: '%s'\n", pl_id, rss_feed_title, url, feed_file_id, rss_item_pubDate, rss_item_title);
break;
}
DPRINTF(E_INFO, L_LIB, "Adding item to RSS id: %d name: '%s' url: %s pubdate: %s title: '%s'\n", pl_id, rss_feed_title, rss_item_url, rss_item_pubDate, rss_item_title);
ret = map_rss_item_to_mfi(&mfi, pl_id, rss_item_url, rss_item_type, rss_feed_author, rss_feed_title, rss_item_title, rss_item_link, rss_item_pubDate, mtime);
if (ret < 0)
{
free_mfi(&mfi, 1);
continue;
}
ret = library_media_save(&mfi);
free_mfi(&mfi, 1);
if (ret < 0)
{
DPRINTF(E_INFO, L_LIB, "Failed to save item for RSS %s\n", url);
break;
}
ret = db_pl_add_item_bypath(pl_id, rss_item_url);
if (ret < 0)
{
DPRINTF(E_LOG, L_LIB, "Failed to add item for RSS %s\n", url);
break;
}
*nadded = *nadded +1;
if (*nadded%50 == 0)
{
DPRINTF(E_INFO, L_LIB, "RSS added %d entries...\n", *nadded);
}
if (limit > 0 && *nadded == limit)
{
DPRINTF(E_INFO, L_LIB, "RSS added limit reached, added %d entries...\n", *nadded);
break;
}
}
cleanup:
evbuffer_free(evbuf);
mxmlDelete(tree);
free(vpath);
free(apple_url);
return ret;
}
/*
 * Subscribes to a new RSS feed: creates a PL_RSS playlist for it and adds the
 * items of the feed, all within one db transaction that is rolled back if
 * anything fails.
 *
 * @param name   title for the new playlist
 * @param path   url of the feed, must start with http:// or https://
 * @param limit  passed on to rss_scan_feed(): only add this number of feed
 *               items when limit > 0
 * @return LIBRARY_OK on success, LIBRARY_ERROR if the feed already exists or
 *         memory ran out, -1 if path is not a http(s) url or the playlist
 *         could not be saved, otherwise the negative result of
 *         rss_scan_feed()
 */
static int
rss_item_add(const char *name, const char *path, int limit)
{
  struct playlist_info *pli;
  struct playlist_info newpli;
  unsigned nadded = 0;
  size_t vpath_size;
  int pl_id;
  int ret;

  DPRINTF(E_DBG, L_LIB, "RSS working on: '%s' '%s'\n", name, path);

  if (strncmp(path, "http://", 7) != 0 && strncmp(path, "https://", 8) != 0)
    {
      DPRINTF(E_LOG, L_LIB, "Invalid RSS path '%s'\n", path);
      return -1;
    }

  pli = db_pl_fetch_bypath(path);
  if (pli)
    {
      DPRINTF(E_LOG, L_LIB, "Duplicate RSS exists id: %d path: %s\n", pli->id, path);
      free_pli(pli, 0);
      return LIBRARY_ERROR;
    }

  // The virtual path is the url with a leading '/'
  vpath_size = strlen(path) + 2;

  memset(&newpli, 0, sizeof(struct playlist_info));
  newpli.type = PL_RSS;
  newpli.path = strdup(path);
  newpli.title = strdup(name);
  newpli.virtual_path = malloc(vpath_size);
  newpli.directory_id = DIR_HTTP;

  if (!newpli.path || !newpli.title || !newpli.virtual_path)
    {
      DPRINTF(E_LOG, L_LIB, "Out of memory for RSS path: %s\n", path);
      free_pli(&newpli, 1);
      return LIBRARY_ERROR;
    }

  snprintf(newpli.virtual_path, vpath_size, "/%s", path);

  db_transaction_begin();

  pl_id = library_playlist_save(&newpli);
  free_pli(&newpli, 1);
  if (pl_id < 0)
    {
      DPRINTF(E_LOG, L_LIB, "Failed to create RSS id path: %s\n", path);
      ret = -1;
      goto rollback_error;
    }

  DPRINTF(E_INFO, L_LIB, "New RSS, created id: %d path: %s\n", pl_id, path);

  // Determine if its really for us -- if not return LIBRARY_PATH_INVALID
  ret = rss_scan_feed(pl_id, path, limit, &nadded);
  if (ret < 0)
    {
      DPRINTF(E_LOG, L_LIB, "Failed to add RSS, dropping id: %d path: %s\n", pl_id, path);
      goto rollback_error;
    }

  db_transaction_end();

  DPRINTF(E_LOG, L_LIB, "Done processing RSS %s added %u items\n", path, nadded);
  return LIBRARY_OK;

 rollback_error:
  db_transaction_rollback();
  return ret;
}
static void
rss_protect_feeds()
{
struct query_params query_params;
struct db_playlist_info dbpli;
unsigned feeds = 0;
int pl_id;
int ret = 0;
memset(&query_params, 0, sizeof(struct query_params));
DPRINTF(E_DBG, L_LIB, "Protecting RSS feeds\n");
query_params.type = Q_PL;
query_params.sort = S_PLAYLIST;
query_params.filter = db_mprintf("(f.type = %d)", PL_RSS);
ret = db_query_start(&query_params);
if (ret < 0)
{
DPRINTF(E_LOG, L_LIB, "Failed to find current RSS feeds from db\n");
goto error;
}
while (((ret = db_query_fetch_pl(&query_params, &dbpli)) == 0) && (dbpli.id))
{
pl_id = atoi(dbpli.id);
DPRINTF(E_DBG, L_LIB, "Protecting feed id: %d '%s' at %s\n", pl_id, dbpli.title, dbpli.path);
db_pl_ping(pl_id);
db_pl_ping_items_bymatch("http://", pl_id);
db_pl_ping_items_bymatch("https://", pl_id);
++feeds;
}
db_query_end(&query_params);
DPRINTF(E_DBG, L_LIB, "Completed protecing RSS feeds: %u\n", feeds);
error:
free(query_params.filter);
}
/* Pings a feed playlist and its http:// and https:// items */
static void
rss_feed_ping(int pl_id)
{
  db_pl_ping(pl_id);
  db_pl_ping_items_bymatch("http://", pl_id);
  db_pl_ping_items_bymatch("https://", pl_id);
}

/*
 * Refreshes all PL_RSS playlists found in the db, each feed in its own db
 * transaction. A feed that fails is rolled back and ends the refresh. Once
 * the library is exiting no more feeds are downloaded, the remaining ones
 * (and a feed rolled back because of the exit) are only pinged.
 *
 * The scanning flag is set for the duration and cleared on every way out, and
 * the refresh timer is always re-armed.
 *
 * @return 0 when all feeds were processed, negative if the query could not be
 *         started or a feed failed
 */
static int
rss_refresh()
{
  struct query_params query_params;
  struct db_playlist_info dbpli;
  unsigned feeds = 0;
  unsigned nadded = 0;
  int pl_id;
  int ret = 0;

  memset(&query_params, 0, sizeof(struct query_params));

  DPRINTF(E_INFO, L_LIB, "Refreshing RSS feeds\n");

  scanning = true;

  query_params.type = Q_PL;
  query_params.sort = S_PLAYLIST;
  query_params.filter = db_mprintf("(f.type = %d)", PL_RSS);

  ret = db_query_start(&query_params);
  if (ret < 0)
    {
      DPRINTF(E_LOG, L_LIB, "Failed to find current RSS feeds from db\n");
      goto error;
    }

  while (((ret = db_query_fetch_pl(&query_params, &dbpli)) == 0) && (dbpli.id))
    {
      pl_id = atoi(dbpli.id);

      if (library_is_exiting())
	{
	  DPRINTF(E_DBG, L_LIB, "library is exiting, protecting feed id: %d '%s' at %s\n", pl_id, dbpli.title, dbpli.path);
	  rss_feed_ping(pl_id);
	  continue;
	}

      DPRINTF(E_DBG, L_LIB, "refreshing '%s' url: '%s' last update: %s\n", dbpli.title, dbpli.path, dbpli.db_timestamp);

      db_transaction_begin();
      ret = rss_scan_feed(pl_id, dbpli.path, -1, &nadded);
      if (ret < 0)
	{
	  db_transaction_rollback();

	  if (!library_is_exiting())
	    break;

	  // The rollback also undid the pings made by rss_scan_feed()
	  DPRINTF(E_DBG, L_LIB, "rolled back RSS update, library is exiting, protecting feed id: %d '%s' at %s\n", pl_id, dbpli.title, dbpli.path);
	  rss_feed_ping(pl_id);
	  continue;
	}
      db_transaction_end();

      ++feeds;
    }

  db_query_end(&query_params);

  DPRINTF(E_INFO, L_LIB, "%s RSS refresh, feeds: %u items: %u\n", ret == 0 ? "Completed" : "Partial", feeds, nadded);

 error:
  // Also cleared when the query failed, otherwise every later rescan would
  // be ignored as "already in progress"
  scanning = false;

  free(query_params.filter);
  evtimer_add(rssev, &rss_refresh_interval);

  return ret;
}
/*
 * Callback registered for rssev in init(): runs a refresh of all feeds. None
 * of the event arguments are used and the result of the refresh is dropped.
 */
static void
rss_refresh_cb(int fd, short what, void *arg)
{
  (void)rss_refresh();
}
/* Thread: library */
/*
 * Rescan (and initscan) entry point of the source: refreshes all feeds and
 * logs how long it took. Does nothing if a scan is already running.
 *
 * @return result of rss_refresh(), or 0 when the rescan was ignored
 */
static int
rss_rescan()
{
  time_t started;
  time_t finished;
  int result;

  if (!scanning)
    {
      started = time(NULL);

      scanning = true;
      result = rss_refresh();
      scanning = false;

      finished = time(NULL);
      DPRINTF(E_LOG, L_LIB, "RSS scan completed in %.f sec\n", difftime(finished, started));

      return result;
    }

  DPRINTF(E_DBG, L_LIB, "Scan already in progress, rescan ignored\n");
  return 0;
}
/*
 * Meta rescan entry point of the source: nothing is downloaded, the existing
 * feeds and their items are only pinged via rss_protect_feeds(). Does nothing
 * if a scan is already running.
 *
 * @return always 0
 */
static int
rss_metarescan()
{
  time_t started;
  time_t finished;

  if (!scanning)
    {
      started = time(NULL);

      scanning = true;
      rss_protect_feeds();
      scanning = false;

      finished = time(NULL);
      DPRINTF(E_LOG, L_LIB, "RSS meta scan completed in %.f sec\n", difftime(finished, started));
    }
  else
    {
      DPRINTF(E_DBG, L_LIB, "Scan already in progress, meta rescan ignored\n");
    }

  return 0;
}
/*
 * Full rescan entry point of the source. Not implemented: it only logs that
 * the RSS feeds will be lost.
 *
 * @return always 0
 */
static int
rss_fullrescan()
{
  const int result = 0;

  DPRINTF(E_LOG, L_LIB, "RSS fullscan not implemented - RSS feeds will be lost\n");

  return result;
}
/*
 * Removes the RSS feed with the given url by purging its playlist.
 *
 * @param url  path of the playlist, i.e. the url the feed was added with
 * @return result of db_pl_purge_byid() if the playlist is a PL_RSS playlist,
 *         LIBRARY_PATH_INVALID if it is some other type of playlist,
 *         LIBRARY_ERROR if there is no playlist with that path
 */
int
rss_item_remove(const char *url)
{
  struct playlist_info *feed;
  int result;

  DPRINTF(E_DBG, L_LIB, "removing RSS: '%s'\n", url);

  feed = db_pl_fetch_bypath(url);
  if (feed == NULL)
    {
      DPRINTF(E_INFO, L_LIB, "Cannot remove RSS - No such RSS feed: '%s'\n", url);
      return LIBRARY_ERROR;
    }

  // Playlists of other types are not ours to remove
  result = (feed->type == PL_RSS) ? db_pl_purge_byid(feed->id) : LIBRARY_PATH_INVALID;

  free_pli(feed, 0);

  return result;
}
/*
 * Init of the source: clears the scanning flag and registers the refresh
 * timer event with the library.
 *
 * @return always 0
 */
static int
init()
{
  // tv_sec is a time_t, which need not be an unsigned long
  DPRINTF(E_INFO, L_LIB, "RSS refresh_period: %ld seconds\n", (long)rss_refresh_interval.tv_sec);

  scanning = false;

  // NOTE(review): the result is not checked - confirm whether
  // library_register_event() can return NULL
  rssev = library_register_event(rss_refresh_cb, NULL, &rss_refresh_interval);

  return 0;
}
/*
 * Deinit of the source: releases the refresh timer event, if there is one,
 * and forgets the pointer so it cannot be used after the free.
 */
static void
deinit()
{
  if (rssev)
    event_free(rssev);

  rssev = NULL;
}
/*
 * Library source definition for RSS feeds: the functions above, hooked up to
 * the library. The initial scan and a rescan are the same operation.
 */
struct library_source rssscanner =
{
  .name = "RSS feed source",
  .disabled = 0,

  // Lifecycle
  .init = init,
  .deinit = deinit,

  // Scanning
  .initscan = rss_rescan,
  .rescan = rss_rescan,
  .metarescan = rss_metarescan,
  .fullrescan = rss_fullrescan,

  // Subscribing to and removing feeds
  .item_add = rss_item_add,
  .item_remove = rss_item_remove,
};