From d40131d38d2a8b4215512a864a5b4a99fc4b9c5e Mon Sep 17 00:00:00 2001
From: whatdoineed2do/Ray
Date: Sun, 8 Mar 2020 20:07:44 +0000
Subject: [PATCH] [scan] RSS support: new library source - RSS scanner;
 periodically updates RSS feeds as found in db once subscribed. Auto
 translates apple podcasts to RSS link

---
 src/Makefile.am          |   1 +
 src/library/rssscanner.c | 864 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 865 insertions(+)
 create mode 100644 src/library/rssscanner.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 0ad66365..81c2c1be 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -103,6 +103,7 @@ forked_daapd_SOURCES = main.c \
 	library/filescanner.c library/filescanner.h \
 	library/filescanner_ffmpeg.c library/filescanner_playlist.c \
 	library/filescanner_smartpl.c $(ITUNES_SRC) \
+	library/rssscanner.c \
 	library.c library.h \
 	$(MDNS_SRC) mdns.h \
 	remote_pairing.c remote_pairing.h \
diff --git a/src/library/rssscanner.c b/src/library/rssscanner.c
new file mode 100644
--- /dev/null
+++ b/src/library/rssscanner.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (C) 2020 whatdoineed2d/Ray
+ * based heavily on filescanner_playlist.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+/*
+ * NOTE(review): the system header names were stripped from this patch in
+ * transit. The list below is reconstructed from what this file uses and
+ * must be confirmed against the author's tree.
+ */
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE
+#endif
+#include <time.h>
+
+#include <event2/event.h>
+#include <event2/buffer.h>
+#include <mxml.h>
+#include "mxml-compat.h"
+
+#include "conffile.h"
+#include "logger.h"
+#include "db.h"
+#include "http.h"
+#include "misc.h"
+#include "misc_json.h"
+#include "library.h"
+#include "library/filescanner.h"
+
+
+static struct event *rssev;
+static struct timeval rss_refresh_interval = { 3600, 0 };
+static bool scanning;
+
+
+// RSS spec: https://validator.w3.org/feed/docs/rss2.html
+
+/*
+ * Parses the RFC822 date of an RSS <pubDate> element, see
+ * https://tools.ietf.org/html/rfc822#section-5, for example
+ *   Fri, 07 Feb 2020 18:58:00 +0000
+ * The leading day of week is optional. The trailing zone (numeric offset or
+ * GMT/UT/EST/A..I/M..Z) is not interpreted.
+ *
+ * @param tm    receives the broken-down time; set to the current UTC time if
+ *              date can't be parsed
+ * @param date  date string from the feed, may be NULL
+ * @return      true if date was parsed, false if the fallback time was used
+ */
+static bool
+rss_date(struct tm *tm, const char *date)
+{
+  const char *ptr = NULL;
+  time_t t;
+
+  memset(tm, 0, sizeof(struct tm));
+
+  // An item without pubDate gives us NULL, which strptime() can't take
+  if (date)
+    {
+      // Day of week is optional, so it is fine if this first step fails
+      ptr = strptime(date, "%a,%n", tm);
+      ptr = strptime(ptr ? ptr : date, "%d%n%b%n%Y%n%H:%M:%S%n", tm);
+    }
+
+  if (!ptr)
+    {
+      // Date is missing or junk, using current time
+      time(&t);
+      gmtime_r(&t, tm);
+      return false;
+    }
+
+  // TODO - adjust the TZ?
+  return true;
+}
+
+/*
+ * Translates an Apple Podcasts page url to the url of the podcast's RSS feed
+ * by asking the iTunes lookup service for the feedUrl, e.g.
+ *   https://itunes.apple.com/lookup?id=974722423
+ *
+ * @param rss_url  Apple url with "/id<number>" as its last path element
+ * @return         allocated feed url which the caller must free, or NULL if
+ *                 the feed url could not be determined
+ */
+static char *
+process_apple_rss(const char *rss_url)
+{
+  struct http_client_ctx ctx;
+  struct evbuffer *evbuf;
+  char url[100];
+  char *buf = NULL;
+  unsigned podid; // apple podcast id
+  json_object *json;
+  json_object *jsonra = NULL;
+  const char *feedURL;
+  const char *ptr;
+  int ret;
+
+  ptr = strrchr(rss_url, '/');
+  if (!ptr || sscanf(ptr, "/id%u", &podid) != 1)
+    {
+      DPRINTF(E_LOG, L_LIB, "Could not parse Apple Podcast RSS ID from '%s'\n", rss_url);
+      return NULL;
+    }
+
+  evbuf = evbuffer_new();
+  if (!evbuf)
+    return NULL;
+
+  snprintf(url, sizeof(url), "https://itunes.apple.com/lookup?id=%u", podid);
+
+  memset(&ctx, 0, sizeof(struct http_client_ctx));
+  ctx.url = url;
+  ctx.input_body = evbuf;
+
+  // A completed request is not enough, the answer must also be a 200
+  ret = http_client_request(&ctx);
+  if (ret < 0 || ctx.response_code != HTTP_OK)
+    {
+      DPRINTF(E_LOG, L_LIB, "Failed Apple Podcast lookup, podcast id %u resp: %d\n", podid, ctx.response_code);
+      evbuffer_free(evbuf);
+      return NULL;
+    }
+
+  json = jparse_obj_from_evbuffer(evbuf);
+  if (!json)
+    {
+      DPRINTF(E_LOG, L_LIB, "Could not parse RSS apple response, podcast id %u\n", podid);
+    }
+  else
+    {
+      /* expect json resp - get feedUrl
+       * {
+       *   "resultCount": 1,
+       *   "results": [
+       *     {
+       *       "wrapperType": "track",
+       *       "kind": "podcast",
+       *       ...
+       *       "collectionViewUrl": "https://podcasts.apple.com/us/podcast/cgp-grey/id974722423?uo=4",
+       *       "feedUrl": "http://cgpgrey.libsyn.com/rss",
+       *       ...
+       *       "genres": [
+       *         "Education",
+       *         "Podcasts",
+       *         "News"
+       *       ]
+       *     }
+       *   ]
+       * }
+       */
+      if (json_object_object_get_ex(json, "results", &jsonra) && (feedURL = jparse_str_from_array(jsonra, 0, "feedUrl")))
+        {
+          // strdup() may fail, the caller then just sees "no mapping"
+          buf = strdup(feedURL);
+          DPRINTF(E_DBG, L_LIB, "mapped apple podcast URL: %s -> %s\n", rss_url, buf ? buf : "(out of memory)");
+        }
+      else
+        DPRINTF(E_DBG, L_LIB, "Could not parse feedURL from RSS apple, podcast id %u\n", podid);
+    }
+
+  jparse_free(json);
+  evbuffer_free(evbuf);
+  return buf;
+}
+
+#ifdef RSS_DEBUG
+/*
+ * Debug aid: logs the items that are in the playlist plid
+ */
+static void
+rss_playlist_items(int plid)
+{
+  struct query_params qp;
+  struct db_media_file_info dbpli;
+  int ret;
+
+  memset(&qp, 0, sizeof(struct query_params));
+
+  qp.type = Q_PLITEMS;
+  qp.idx_type = I_NONE;
+  qp.id = plid;
+
+  ret = db_query_start(&qp);
+  if (ret < 0)
+    {
+      db_query_end(&qp);
+      return;
+    }
+
+  while (((ret = db_query_fetch_file(&qp, &dbpli)) == 0) && (dbpli.id))
+    {
+      DPRINTF(E_LOG, L_LIB, "plid=%d { id=%s title=%s path=%s }\n", plid, dbpli.id, dbpli.title, dbpli.path);
+    }
+
+  db_query_end(&qp);
+}
+#endif
+
+/*
+ * Pings the playlist of a feed and the http(s) items of that playlist.
+ * NOTE(review): presumably the ping is what keeps them from being purged as
+ * stale after a library scan - confirm against db.c/library.c
+ */
+static void
+rss_protect_feed(int pl_id)
+{
+  db_pl_ping(pl_id);
+  db_pl_ping_items_bymatch("http://", pl_id);
+  db_pl_ping_items_bymatch("https://", pl_id);
+}
+
+/*
+ * Fills mfi for one feed item: metadata is first read from the media stream
+ * and is then completed or overridden with what the feed says.
+ *
+ * @param mfi               output, zeroed first; the caller must free its
+ *                          content, also when this function fails
+ * @param pl_id             id of the feed's playlist, only used for logging
+ * @param rss_item_url      url of the item's enclosure (the media)
+ * @param rss_item_type     type attribute of the enclosure, may be NULL
+ * @param rss_feed_author   author of the feed
+ * @param rss_feed_title    title of the feed
+ * @param rss_item_title    title of the item
+ * @param rss_item_link     link of the item
+ * @param rss_item_pubDate  publication date of the item, may be NULL
+ * @param mtime             value to set as the item's time_added
+ * @return                  0 on success, -1 if the stream gave neither a
+ *                          length nor a size
+ */
+static int
+map_rss_item_to_mfi(struct media_file_info *mfi, int pl_id, const char *rss_item_url, const char *rss_item_type, const char *rss_feed_author, const char *rss_feed_title, const char *rss_item_title, const char *rss_item_link, const char *rss_item_pubDate, time_t mtime)
+{
+  struct tm tm;
+
+  memset(mfi, 0, sizeof(struct media_file_info));
+  scan_metadata_stream(mfi, rss_item_url);
+
+  if (mfi->song_length == 0 && mfi->file_size == 0)
+    {
+      DPRINTF(E_INFO, L_LIB, "Ignoring item (empty media) RSS id: %d name: '%s' url: %s pubdate: %s title: '%s'\n", pl_id, rss_feed_title, rss_item_url, rss_item_pubDate, rss_item_title);
+      return -1;
+    }
+
+  // Always take the meta from media file if possible; some podcasts
+  // (apple) can use mp4 streams which tend not to have decent tags so
+  // in those cases take info from the RSS and not the stream
+  if (!mfi->artist)
+    mfi->artist = safe_strdup(rss_feed_author);
+  if (!mfi->album)
+    mfi->album = safe_strdup(rss_feed_title);
+  if (!mfi->url)
+    mfi->url = safe_strdup(rss_item_link);
+  if (!mfi->genre || strcmp("(186)Podcast", mfi->genre) == 0)
+    {
+      free(mfi->genre);
+      mfi->genre = strdup("Podcast");
+    }
+
+  // Title not valid on most mp4 (it becomes the url obj) so take from RSS feed
+  if (rss_item_type && strncmp("video", rss_item_type, 5) == 0)
+    {
+      free(mfi->title);
+      mfi->title = safe_strdup(rss_item_title);
+    }
+
+  // Ignore this - some can be very verbose - we don't use these on the podcast
+  free(mfi->comment);
+  mfi->comment = NULL;
+
+  // Date is always from the RSS feed info
+  rss_date(&tm, rss_item_pubDate);
+  mfi->date_released = mktime(&tm);
+  mfi->year = 1900 + tm.tm_year;
+
+  mfi->media_kind = MEDIA_KIND_PODCAST;
+
+  // Fake the time - useful when we are adding a new stream - since the
+  // newest podcasts are added first (the stream is most recent first)
+  // having time_added date which is older on the most recent episodes
+  // makes no sense so make all the dates the same for a single update
+  mfi->time_added = mtime;
+
+  mfi->id = db_file_id_bypath(rss_item_url);
+
+  return 0;
+}
+
+/*
+ * Downloads a feed and adds the items that are not in the db yet to the
+ * playlist of the feed. Feeds list the most recent item first, so the scan
+ * stops at the first item that is already known. Items without an enclosure
+ * url or without usable media are skipped.
+ *
+ * @param pl_id   id of the feed's playlist
+ * @param url     url of the feed; Apple Podcasts page urls are translated
+ * @param limit   if > 0 then stop when this many items have been added
+ * @param nadded  incremented for each item that is added, must not be NULL
+ * @return        negative on failure (-1, LIBRARY_ERROR or
+ *                LIBRARY_PATH_INVALID), otherwise the scan succeeded
+ */
+int
+rss_scan_feed(int pl_id, const char *url, long limit, unsigned *nadded)
+{
+  struct media_file_info mfi;
+  struct http_client_ctx ctx;
+  struct evbuffer *evbuf;
+  char *apple_url = NULL;
+  char *vpath = NULL;
+  size_t vpathlen = 0;
+  size_t len;
+  int feed_file_id;
+  time_t mtime;
+  const char *rss_xml;
+  mxml_node_t *tree = NULL;
+  mxml_node_t *channel;
+  mxml_node_t *node;
+  mxml_node_t *item;
+  const char *rss_feed_title;
+  const char *rss_feed_author = NULL;
+  const char *rss_item_title;
+  const char *rss_item_pubDate;
+  const char *rss_item_url;
+  const char *rss_item_link;
+  const char *rss_item_type;
+  int ret;
+
+  DPRINTF(E_DBG, L_LIB, "Refreshing RSS id: %d url: %s limit: %ld\n", pl_id, url, limit);
+  rss_protect_feed(pl_id);
+
+  // Nothing to clean up yet, and evbuffer_free() is not documented to accept
+  // NULL, so return directly
+  evbuf = evbuffer_new();
+  if (!evbuf)
+    return -1;
+
+  // Is it an apple podcast stream?
+  // ie https://podcasts.apple.com/is/podcast/cgp-grey/id974722423
+  if (strncmp(url, "https://podcasts.apple.com/", 27) == 0)
+    apple_url = process_apple_rss(url);
+
+  memset(&ctx, 0, sizeof(struct http_client_ctx));
+  ctx.url = apple_url ? apple_url : url;
+  ctx.input_body = evbuf;
+
+  ret = http_client_request(&ctx);
+  if (ret < 0 || ctx.response_code != HTTP_OK)
+    {
+      DPRINTF(E_WARN, L_LIB, "Failed to fetch RSS id: %d url: %s resp: %d\n", pl_id, url, ctx.response_code);
+      ret = -1;
+      goto cleanup;
+    }
+
+  // Terminate the body so that it can be used as a string
+  ret = evbuffer_add(evbuf, "", 1);
+  rss_xml = (ret == 0) ? (const char *)evbuffer_pullup(evbuf, -1) : NULL;
+  if (!rss_xml || strlen(rss_xml) == 0)
+    {
+      DPRINTF(E_WARN, L_LIB, "Failed to fetch valid RSS/xml data RSS id: %d url: %s\n", pl_id, url);
+      ret = LIBRARY_PATH_INVALID;
+      goto cleanup;
+    }
+
+  tree = mxmlLoadString(NULL, rss_xml, MXML_OPAQUE_CALLBACK);
+
+  channel = mxmlFindElement(tree, tree, "channel", NULL, NULL, MXML_DESCEND);
+  if (!channel)
+    {
+      DPRINTF(E_WARN, L_LIB, "Invalid RSS/xml, missing 'channel' node - RSS id: %d url: %s\n", pl_id, url);
+      DPRINTF(E_DBG, L_LIB, "RSS xml len: %zu xml: { %s }\n", strlen(rss_xml), rss_xml);
+      ret = LIBRARY_PATH_INVALID;
+      goto cleanup;
+    }
+
+  node = mxmlFindElement(channel, channel, "title", NULL, NULL, MXML_DESCEND);
+  if (!node)
+    {
+      DPRINTF(E_WARN, L_LIB, "Invalid RSS/xml, missing 'title' - RSS id: %d url: %s\n", pl_id, url);
+      ret = LIBRARY_PATH_INVALID;
+      goto cleanup;
+    }
+  rss_feed_title = mxmlGetOpaque(node);
+
+  node = mxmlFindElement(channel, channel, "itunes:author", NULL, NULL, MXML_DESCEND);
+  if (node)
+    rss_feed_author = mxmlGetOpaque(node);
+
+  time(&mtime);
+  ret = 0;
+  memset(&mfi, 0, sizeof(struct media_file_info));
+  for (node = mxmlFindElement(channel, channel, "item", NULL, NULL, MXML_DESCEND);
+       node != NULL;
+       node = mxmlFindElement(node, channel, "item", NULL, NULL, MXML_DESCEND))
+    {
+      if (library_is_exiting())
+        {
+          DPRINTF(E_WARN, L_LIB, "Abandoning RSS feed refresh due to library exit, will need to rollback pl: %d url: %s\n", pl_id, url);
+          ret = LIBRARY_ERROR;
+          break;
+        }
+
+      item = mxmlFindElement(node, node, "title", NULL, NULL, MXML_DESCEND);
+      rss_item_title = mxmlGetOpaque(item);
+
+      item = mxmlFindElement(node, node, "pubDate", NULL, NULL, MXML_DESCEND);
+      rss_item_pubDate = mxmlGetOpaque(item);
+
+      item = mxmlFindElement(node, node, "link", NULL, NULL, MXML_DESCEND);
+      rss_item_link = mxmlGetOpaque(item);
+
+      item = mxmlFindElement(node, node, "enclosure", NULL, NULL, MXML_DESCEND);
+      rss_item_url = mxmlElementGetAttr(item, "url");
+      rss_item_type = mxmlElementGetAttr(item, "type");
+
+      DPRINTF(E_DBG, L_LIB, "Feed provides RSS id: %d name: '%s' pubDate: %s url: %s title: '%s'\n", pl_id, rss_feed_title, rss_item_pubDate, rss_item_url, rss_item_title);
+      if (!rss_item_url)
+        continue;
+
+      // The virtual path is "/" + url; the buffer is reused and only grows
+      len = strlen(rss_item_url) + 2;
+      if (len > vpathlen)
+        {
+          free(vpath);
+          vpath = malloc(len);
+          if (!vpath)
+            {
+              DPRINTF(E_LOG, L_LIB, "Out of memory for RSS item path, RSS id: %d\n", pl_id);
+              ret = -1;
+              break;
+            }
+          vpathlen = len;
+        }
+      snprintf(vpath, vpathlen, "/%s", rss_item_url);
+
+      // Check if this item is already in the db - if so, we can stop since
+      // the feed gives us the most recent items first
+      feed_file_id = db_file_id_by_virtualpath_match(vpath);
+      if (feed_file_id > 0)
+        {
+          DPRINTF(E_DBG, L_LIB, "Most recent DB RSS id: %d name: '%s' url: %s file_id: %d pubdate: %s title: '%s'\n", pl_id, rss_feed_title, url, feed_file_id, rss_item_pubDate, rss_item_title);
+          break;
+        }
+
+      DPRINTF(E_INFO, L_LIB, "Adding item to RSS id: %d name: '%s' url: %s pubdate: %s title: '%s'\n", pl_id, rss_feed_title, rss_item_url, rss_item_pubDate, rss_item_title);
+
+      ret = map_rss_item_to_mfi(&mfi, pl_id, rss_item_url, rss_item_type, rss_feed_author, rss_feed_title, rss_item_title, rss_item_link, rss_item_pubDate, mtime);
+      if (ret < 0)
+        {
+          // The item is only skipped, so its error must not become the
+          // result of the whole scan if this was the last item
+          free_mfi(&mfi, 1);
+          ret = 0;
+          continue;
+        }
+
+      ret = library_media_save(&mfi);
+      free_mfi(&mfi, 1);
+      if (ret < 0)
+        {
+          DPRINTF(E_INFO, L_LIB, "Failed to save item for RSS %s\n", url);
+          break;
+        }
+
+      ret = db_pl_add_item_bypath(pl_id, rss_item_url);
+      if (ret < 0)
+        {
+          DPRINTF(E_LOG, L_LIB, "Failed to add item for RSS %s\n", url);
+          break;
+        }
+
+      (*nadded)++;
+      if (*nadded % 50 == 0)
+        {
+          DPRINTF(E_INFO, L_LIB, "RSS added %u entries...\n", *nadded);
+        }
+
+      if (limit > 0 && *nadded >= (unsigned long)limit)
+        {
+          DPRINTF(E_INFO, L_LIB, "RSS added limit reached, added %u entries...\n", *nadded);
+          break;
+        }
+    }
+
+ cleanup:
+  evbuffer_free(evbuf);
+  mxmlDelete(tree);
+  free(vpath);
+  free(apple_url);
+
+  return ret;
+}
+
+/*
+ * Library callback for subscribing to a feed: creates the playlist of the
+ * feed and adds the feed's items. The db changes are rolled back on failure.
+ *
+ * @param name   title to give the feed's playlist
+ * @param path   http:// or https:// url of the feed
+ * @param limit  if > 0 then the max number of items to add
+ * @return       LIBRARY_OK on success, otherwise negative: -1, LIBRARY_ERROR
+ *               or LIBRARY_PATH_INVALID
+ */
+static int
+rss_item_add(const char *name, const char *path, int limit)
+{
+  struct playlist_info *pli;
+  struct playlist_info newpli;
+  size_t vpathlen;
+  unsigned nadded = 0;
+  int pl_id;
+  int ret;
+
+  DPRINTF(E_DBG, L_LIB, "RSS working on: '%s' '%s'\n", name, path);
+  if (strncmp(path, "http://", 7) != 0 && strncmp(path, "https://", 8) != 0)
+    {
+      DPRINTF(E_LOG, L_LIB, "Invalid RSS path '%s'\n", path);
+      return -1;
+    }
+
+  pli = db_pl_fetch_bypath(path);
+  if (pli)
+    {
+      DPRINTF(E_LOG, L_LIB, "Duplicate RSS exists id: %d path: %s\n", pli->id, path);
+      free_pli(pli, 0);
+      return LIBRARY_ERROR;
+    }
+
+  memset(&newpli, 0, sizeof(struct playlist_info));
+
+  vpathlen = strlen(path) + 2;
+  newpli.type = PL_RSS;
+  newpli.path = strdup(path);
+  newpli.title = strdup(name);
+  newpli.virtual_path = malloc(vpathlen);
+  newpli.directory_id = DIR_HTTP;
+  if (!newpli.path || !newpli.title || !newpli.virtual_path)
+    {
+      DPRINTF(E_LOG, L_LIB, "Out of memory for RSS path: %s\n", path);
+      free_pli(&newpli, 1);
+      return LIBRARY_ERROR;
+    }
+  snprintf(newpli.virtual_path, vpathlen, "/%s", path);
+
+  db_transaction_begin();
+  pl_id = library_playlist_save(&newpli);
+  free_pli(&newpli, 1);
+  if (pl_id < 0)
+    {
+      DPRINTF(E_LOG, L_LIB, "Failed to create RSS id path: %s\n", path);
+      ret = -1;
+      goto rollback_error;
+    }
+
+  DPRINTF(E_INFO, L_LIB, "New RSS, created id: %d path: %s\n", pl_id, path);
+
+  // Determine if its really for us -- if not return LIBRARY_PATH_INVALID
+  ret = rss_scan_feed(pl_id, path, limit, &nadded);
+  if (ret < 0)
+    {
+      DPRINTF(E_LOG, L_LIB, "Failed to add RSS, dropping id: %d path: %s\n", pl_id, path);
+      goto rollback_error;
+    }
+  db_transaction_end();
+  DPRINTF(E_LOG, L_LIB, "Done processing RSS %s added %u items\n", path, nadded);
+
+  return LIBRARY_OK;
+
+ rollback_error:
+  db_transaction_rollback();
+  return ret;
+}
+
+/*
+ * Pings all feeds and their items in the db, without refreshing them
+ */
+static void
+rss_protect_feeds(void)
+{
+  struct query_params query_params;
+  struct db_playlist_info dbpli;
+  unsigned feeds = 0;
+  int pl_id;
+  int ret;
+
+  memset(&query_params, 0, sizeof(struct query_params));
+
+  DPRINTF(E_DBG, L_LIB, "Protecting RSS feeds\n");
+
+  query_params.type = Q_PL;
+  query_params.sort = S_PLAYLIST;
+  query_params.filter = db_mprintf("(f.type = %d)", PL_RSS);
+
+  ret = db_query_start(&query_params);
+  if (ret < 0)
+    {
+      DPRINTF(E_LOG, L_LIB, "Failed to find current RSS feeds from db\n");
+      goto error;
+    }
+
+  while (((ret = db_query_fetch_pl(&query_params, &dbpli)) == 0) && (dbpli.id))
+    {
+      pl_id = atoi(dbpli.id);
+
+      DPRINTF(E_DBG, L_LIB, "Protecting feed id: %d '%s' at %s\n", pl_id, dbpli.title, dbpli.path);
+      rss_protect_feed(pl_id);
+
+      ++feeds;
+    }
+  db_query_end(&query_params);
+
+  DPRINTF(E_DBG, L_LIB, "Completed protecing RSS feeds: %u\n", feeds);
+
+ error:
+  free(query_params.filter);
+}
+
+/*
+ * Refreshes all the feeds in the db, each in a transaction of its own, and
+ * then rearms the refresh timer. A feed that fails is rolled back and pinged,
+ * and the refresh continues with the next feed. When the library is exiting
+ * the remaining feeds are only pinged.
+ * NOTE(review): continuing after a failed feed assumes that feeds which are
+ * left unpinged could be purged after a library scan - confirm
+ *
+ * @return  0 if all feeds were refreshed, otherwise the error from listing
+ *          the feeds or from the first feed that failed
+ */
+static int
+rss_refresh(void)
+{
+  struct query_params query_params;
+  struct db_playlist_info dbpli;
+  unsigned feeds = 0;
+  unsigned nadded = 0;
+  int pl_id;
+  int failed = 0;
+  int ret;
+
+  memset(&query_params, 0, sizeof(struct query_params));
+
+  DPRINTF(E_INFO, L_LIB, "Refreshing RSS feeds\n");
+  scanning = true;
+
+  query_params.type = Q_PL;
+  query_params.sort = S_PLAYLIST;
+  query_params.filter = db_mprintf("(f.type = %d)", PL_RSS);
+
+  ret = db_query_start(&query_params);
+  if (ret < 0)
+    {
+      DPRINTF(E_LOG, L_LIB, "Failed to find current RSS feeds from db\n");
+      goto error;
+    }
+
+  while (((ret = db_query_fetch_pl(&query_params, &dbpli)) == 0) && (dbpli.id))
+    {
+      pl_id = atoi(dbpli.id);
+
+      if (library_is_exiting())
+        {
+          DPRINTF(E_DBG, L_LIB, "library is exiting, protecting feed id: %d '%s' at %s\n", pl_id, dbpli.title, dbpli.path);
+          rss_protect_feed(pl_id);
+          continue;
+        }
+
+      DPRINTF(E_DBG, L_LIB, "refreshing '%s' url: '%s' last update: %s\n", dbpli.title, dbpli.path, dbpli.db_timestamp);
+
+      db_transaction_begin();
+      ret = rss_scan_feed(pl_id, dbpli.path, -1, &nadded);
+      if (ret < 0)
+        {
+          db_transaction_rollback();
+
+          // Presumably the rollback also undid the ping that the scan made
+          rss_protect_feed(pl_id);
+          if (library_is_exiting())
+            {
+              DPRINTF(E_DBG, L_LIB, "rolled back RSS update, library is exiting, protecting feed id: %d '%s' at %s\n", pl_id, dbpli.title, dbpli.path);
+            }
+          else
+            {
+              DPRINTF(E_LOG, L_LIB, "Failed to refresh RSS id: %d url: %s\n", pl_id, dbpli.path);
+              if (failed == 0)
+                failed = ret;
+            }
+          continue;
+        }
+      db_transaction_end();
+
+      ++feeds;
+    }
+  db_query_end(&query_params);
+
+  if (ret == 0)
+    ret = failed;
+
+  DPRINTF(E_INFO, L_LIB, "%s RSS refresh, feeds: %u items: %u\n", ret == 0 ? "Completed" : "Partial", feeds, nadded);
+
+ error:
+  free(query_params.filter);
+
+  // Also on the error path, or later rescans would be ignored forever
+  scanning = false;
+
+  // rssev is NULL if the event could not be registered
+  if (rssev)
+    evtimer_add(rssev, &rss_refresh_interval);
+
+  return ret;
+}
+
+static void
+rss_refresh_cb(int fd, short what, void *arg)
+{
+  rss_refresh();
+}
+
+/* Thread: library */
+static int
+rss_rescan(void)
+{
+  time_t start;
+  time_t end;
+  int ret;
+
+  if (scanning)
+    {
+      DPRINTF(E_DBG, L_LIB, "Scan already in progress, rescan ignored\n");
+      return 0;
+    }
+
+  start = time(NULL);
+  scanning = true;
+
+  ret = rss_refresh();
+
+  scanning = false;
+  end = time(NULL);
+
+  DPRINTF(E_LOG, L_LIB, "RSS scan completed in %.f sec\n", difftime(end, start));
+  return ret;
+}
+
+static int
+rss_metarescan(void)
+{
+  time_t start;
+  time_t end;
+
+  if (scanning)
+    {
+      DPRINTF(E_DBG, L_LIB, "Scan already in progress, meta rescan ignored\n");
+      return 0;
+    }
+
+  start = time(NULL);
+  scanning = true;
+
+  rss_protect_feeds();
+
+  scanning = false;
+  end = time(NULL);
+
+  DPRINTF(E_LOG, L_LIB, "RSS meta scan completed in %.f sec\n", difftime(end, start));
+  return 0;
+}
+
+static int
+rss_fullrescan(void)
+{
+  DPRINTF(E_LOG, L_LIB, "RSS fullscan not implemented - RSS feeds will be lost\n");
+  return 0;
+}
+
+/*
+ * Library callback for unsubscribing: purges the feed playlist with path url.
+ *
+ * @param url  url of the feed
+ * @return     result of the purge, LIBRARY_ERROR if there is no playlist
+ *             with this path, LIBRARY_PATH_INVALID if it is not a feed
+ */
+int
+rss_item_remove(const char *url)
+{
+  struct playlist_info *pli;
+  int ret;
+
+  DPRINTF(E_DBG, L_LIB, "removing RSS: '%s'\n", url);
+
+  pli = db_pl_fetch_bypath(url);
+  if (!pli)
+    {
+      DPRINTF(E_INFO, L_LIB, "Cannot remove RSS - No such RSS feed: '%s'\n", url);
+      return LIBRARY_ERROR;
+    }
+
+  if (pli->type == PL_RSS)
+    ret = db_pl_purge_byid(pli->id);
+  else
+    ret = LIBRARY_PATH_INVALID;
+
+  free_pli(pli, 0);
+  return ret;
+}
+
+static int
+init(void)
+{
+  DPRINTF(E_INFO, L_LIB, "RSS refresh_period: %lu seconds\n", (unsigned long)rss_refresh_interval.tv_sec);
+
+  scanning = false;
+  rssev = library_register_event(rss_refresh_cb, NULL, &rss_refresh_interval);
+  if (!rssev)
+    DPRINTF(E_LOG, L_LIB, "Could not register RSS refresh event, feeds are only refreshed by rescans\n");
+
+  return 0;
+}
+
+static void
+deinit(void)
+{
+  if (rssev)
+    event_free(rssev);
+  rssev = NULL;
+}
+
+struct library_source rssscanner =
+{
+  .name = "RSS feed source",
+  .disabled = 0,
+  .init = init,
+  .deinit = deinit,
+  .rescan = rss_rescan,
+  .metarescan = rss_metarescan,
+  .initscan = rss_rescan,
+  .fullrescan = rss_fullrescan,
+  .item_add = rss_item_add,
+  .item_remove = rss_item_remove,
+};