owntone-server/src/transcode.c
ejurgensen 5542492d33 [xcode] Fix calculation of estimated wav sizes (causing M1001 crash)
Fix regression from commit 3ee9204 where the wav size calculation was changed. The
new method caused the Soundbridge M1001 to reboot, and the likely cause seems to be
that the sizes in the wav header may not be multiples of channels x
bytes_per_sample, i.e. typically 4.

Resolves #1770
2024-06-24 23:55:32 +02:00

/*
* Copyright (C) 2015-17 Espen Jurgensen
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavfilter/avfilter.h>
#include <libavfilter/buffersink.h>
#include <libavfilter/buffersrc.h>
#include <libavutil/opt.h>
#include <libavutil/time.h>
#include <libavutil/pixdesc.h>
#include <libavutil/channel_layout.h>
#include <libavutil/mathematics.h>
#include "logger.h"
#include "conffile.h"
#include "misc.h"
#include "transcode.h"
// Switches for compatibility with ffmpeg's ever-changing API
#define USE_IMAGE2PIPE (LIBAVFORMAT_VERSION_MAJOR > 58) || ((LIBAVFORMAT_VERSION_MAJOR == 58) && (LIBAVFORMAT_VERSION_MINOR > 29))
#define USE_CONST_AVFORMAT (LIBAVFORMAT_VERSION_MAJOR > 59) || ((LIBAVFORMAT_VERSION_MAJOR == 59) && (LIBAVFORMAT_VERSION_MINOR > 15))
#define USE_CONST_AVCODEC (LIBAVFORMAT_VERSION_MAJOR > 59) || ((LIBAVFORMAT_VERSION_MAJOR == 59) && (LIBAVFORMAT_VERSION_MINOR > 15))
#define USE_NO_CLEAR_AVFMT_NOFILE (LIBAVFORMAT_VERSION_MAJOR > 59) || ((LIBAVFORMAT_VERSION_MAJOR == 59) && (LIBAVFORMAT_VERSION_MINOR > 15))
#define USE_CH_LAYOUT (LIBAVCODEC_VERSION_MAJOR > 59) || ((LIBAVCODEC_VERSION_MAJOR == 59) && (LIBAVCODEC_VERSION_MINOR > 24))
#define USE_CONST_AVIO_WRITE_PACKET (LIBAVFORMAT_VERSION_MAJOR > 61) || ((LIBAVFORMAT_VERSION_MAJOR == 61) && (LIBAVFORMAT_VERSION_MINOR > 0))
// Interval between ICY metadata checks for streams, in seconds
#define METADATA_ICY_INTERVAL 5
// Maximum number of streams in a file that we will accept
#define MAX_STREAMS 64
// Maximum number of times we retry when we encounter bad packets
#define MAX_BAD_PACKETS 5
// How long to wait (in microsec) before interrupting av_read_frame
#define READ_TIMEOUT 30000000
// Buffer size for reading/writing input and output evbuffers
#define AVIO_BUFFER_SIZE 4096
// Size of the wav header that iTunes needs
#define WAV_HEADER_LEN 44
// Max filters in a filtergraph
#define MAX_FILTERS 9
// Set to same size as in httpd.c (but can be set to something else)
#define STREAM_CHUNK_SIZE (64 * 1024)
static const char *default_codecs = "mpeg,alac,wav";
static const char *roku_codecs = "mpeg,mp4a,wma,alac,wav";
static const char *itunes_codecs = "mpeg,mp4a,mp4v,alac,wav";
// Used for passing errors to DPRINTF (can't count on av_err2str being present)
static char errbuf[64];
// Used by dummy_seek to mark a seek requested by ffmpeg
static const uint8_t xcode_seek_marker[8] = { 0x0D, 0x0E, 0x0A, 0x0D, 0x0B, 0x0E, 0x0E, 0x0F };
// The settings struct will be filled out based on the profile enum
struct settings_ctx
{
bool encode_video;
bool encode_audio;
// Silence some log messages
bool silent;
// Output format (for the muxer)
const char *format;
// Input format (for the demuxer)
const char *in_format;
// Audio settings
enum AVCodecID audio_codec;
int sample_rate;
#if USE_CH_LAYOUT
AVChannelLayout channel_layout;
#else
uint64_t channel_layout;
#endif
int nb_channels;
int bit_rate;
int frame_size;
enum AVSampleFormat sample_format;
bool with_mp4_header;
bool with_wav_header;
bool without_libav_header;
bool without_libav_trailer;
bool with_icy;
bool with_user_filters;
// Video settings
enum AVCodecID video_codec;
const char *video_codec_name;
enum AVPixelFormat pix_fmt;
int height;
int width;
};
struct stream_ctx
{
AVStream *stream;
AVCodecContext *codec;
AVFilterContext *buffersink_ctx;
AVFilterContext *buffersrc_ctx;
AVFilterGraph *filter_graph;
// Used for seeking
int64_t prev_pts;
int64_t offset_pts;
};
struct decode_ctx
{
// Settings derived from the profile
struct settings_ctx settings;
// Input format context
AVFormatContext *ifmt_ctx;
// IO Context for non-file input
AVIOContext *avio;
// Stream and decoder data
struct stream_ctx audio_stream;
struct stream_ctx video_stream;
// Source duration in ms as provided by caller
uint32_t len_ms;
// Used to determine if ICY metadata is relevant to look for
bool is_http;
// Set to true if we just seeked
bool resume;
// Set to true if we have reached eof
bool eof;
// Set to true if avcodec_receive_frame() gave us a frame
bool got_frame;
// Contains the most recent packet from av_read_frame()
AVPacket *packet;
// Contains the most recent frame from avcodec_receive_frame()
AVFrame *decoded_frame;
// Used to measure if av_read_frame is taking too long
int64_t timestamp;
};
struct encode_ctx
{
// Settings derived from the profile
struct settings_ctx settings;
// Output format context
AVFormatContext *ofmt_ctx;
// Stream, filter and decoder data
struct stream_ctx audio_stream;
struct stream_ctx video_stream;
// The ffmpeg muxer writes to this buffer using the avio_evbuffer interface
struct evbuffer *obuf;
// IO Context for non-file output
struct transcode_evbuf_io evbuf_io;
// Contains the most recent packet from av_buffersink_get_frame()
AVFrame *filt_frame;
// Contains the most recent packet from avcodec_receive_packet()
AVPacket *encoded_pkt;
// How many output bytes we have processed in total
off_t bytes_processed;
// Estimated total size of output
off_t bytes_total;
// Used to check for ICY metadata changes at certain intervals
uint32_t icy_interval;
uint32_t icy_hash;
};
enum probe_type
{
PROBE_TYPE_DEFAULT,
PROBE_TYPE_QUICK,
};
struct avio_evbuffer {
struct evbuffer *evbuf;
uint8_t *buffer;
transcode_seekfn seekfn;
void *seekfn_arg;
};
struct filter_def
{
char name[64];
char args[512];
};
struct filters
{
AVFilterContext *av_ctx;
// Function that will create the filter arguments for ffmpeg
int (*deffn)(struct filter_def *, struct stream_ctx *, struct stream_ctx *, const char *);
const char *deffn_arg;
};
/* -------------------------- PROFILE CONFIGURATION ------------------------ */
static int
init_settings(struct settings_ctx *settings, enum transcode_profile profile, struct media_quality *quality)
{
memset(settings, 0, sizeof(struct settings_ctx));
switch (profile)
{
case XCODE_PCM_NATIVE: // Sample rate and bit depth determined by source
settings->encode_audio = true;
settings->with_icy = true;
settings->with_user_filters = true;
break;
case XCODE_WAV:
settings->with_wav_header = true;
settings->with_user_filters = true;
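// Fall through: XCODE_WAV shares the PCM16 encoder settings set below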
case XCODE_PCM16:
settings->encode_audio = true;
settings->format = "s16le";
settings->audio_codec = AV_CODEC_ID_PCM_S16LE;
settings->sample_format = AV_SAMPLE_FMT_S16;
break;
case XCODE_PCM24:
settings->encode_audio = true;
settings->format = "s24le";
settings->audio_codec = AV_CODEC_ID_PCM_S24LE;
settings->sample_format = AV_SAMPLE_FMT_S32;
break;
case XCODE_PCM32:
settings->encode_audio = true;
settings->format = "s32le";
settings->audio_codec = AV_CODEC_ID_PCM_S32LE;
settings->sample_format = AV_SAMPLE_FMT_S32;
break;
case XCODE_MP3:
settings->encode_audio = true;
settings->format = "mp3";
settings->audio_codec = AV_CODEC_ID_MP3;
settings->sample_format = AV_SAMPLE_FMT_S16P;
break;
case XCODE_OPUS:
settings->encode_audio = true;
settings->format = "data"; // Means we get the raw packet from the encoder, no muxing
settings->audio_codec = AV_CODEC_ID_OPUS;
settings->sample_format = AV_SAMPLE_FMT_S16; // Only libopus support
break;
case XCODE_ALAC:
settings->encode_audio = true;
settings->format = "data"; // Means we get the raw packet from the encoder, no muxing
settings->audio_codec = AV_CODEC_ID_ALAC;
settings->sample_format = AV_SAMPLE_FMT_S16P;
settings->frame_size = 352;
break;
case XCODE_MP4_ALAC:
settings->with_mp4_header = true;
settings->encode_audio = true;
settings->format = "data";
settings->audio_codec = AV_CODEC_ID_ALAC;
break;
case XCODE_MP4_ALAC_HEADER:
settings->without_libav_header = true;
settings->without_libav_trailer = true;
settings->encode_audio = true;
settings->format = "ipod"; // ffmpeg default mp4 variant ("mp4" doesn't work with SoundBridge because of the btrt atom in the header)
settings->audio_codec = AV_CODEC_ID_ALAC;
break;
case XCODE_OGG:
settings->encode_audio = true;
settings->in_format = "ogg";
break;
case XCODE_JPEG:
settings->encode_video = true;
settings->silent = true;
// With ffmpeg 4.3 (> libavformat 58.29) "image2" only works for actual file
// output. It's possible we should have used "image2pipe" all along, but since
// "image2" has been working we only replace it going forward.
#if USE_IMAGE2PIPE
settings->format = "image2pipe";
#else
settings->format = "image2";
#endif
settings->in_format = "mjpeg";
settings->pix_fmt = AV_PIX_FMT_YUVJ420P;
settings->video_codec = AV_CODEC_ID_MJPEG;
break;
case XCODE_PNG:
settings->encode_video = true;
settings->silent = true;
// See explanation above
#if USE_IMAGE2PIPE
settings->format = "image2pipe";
#else
settings->format = "image2";
#endif
settings->pix_fmt = AV_PIX_FMT_RGB24;
settings->video_codec = AV_CODEC_ID_PNG;
break;
case XCODE_VP8:
settings->encode_video = true;
settings->silent = true;
// See explanation above
#if USE_IMAGE2PIPE
settings->format = "image2pipe";
#else
settings->format = "image2";
#endif
settings->pix_fmt = AV_PIX_FMT_YUVJ420P;
settings->video_codec = AV_CODEC_ID_VP8;
break;
default:
DPRINTF(E_LOG, L_XCODE, "Bug! Unknown transcoding profile\n");
return -1;
}
if (quality && quality->sample_rate)
{
settings->sample_rate = quality->sample_rate;
}
if (quality && quality->channels)
{
#if USE_CH_LAYOUT
av_channel_layout_default(&settings->channel_layout, quality->channels);
#else
settings->channel_layout = av_get_default_channel_layout(quality->channels);
settings->nb_channels = quality->channels;
#endif
}
if (quality && quality->bit_rate)
{
settings->bit_rate = quality->bit_rate;
}
if (quality && quality->bits_per_sample && (quality->bits_per_sample != 8 * av_get_bytes_per_sample(settings->sample_format)))
{
DPRINTF(E_LOG, L_XCODE, "Bug! Mismatch between profile (%d bps) and media quality (%d bps)\n", 8 * av_get_bytes_per_sample(settings->sample_format), quality->bits_per_sample);
return -1;
}
return 0;
}
static int
init_settings_from_video(struct settings_ctx *settings, enum transcode_profile profile, struct decode_ctx *src_ctx, int width, int height)
{
settings->width = width;
settings->height = height;
return 0;
}
static int
init_settings_from_audio(struct settings_ctx *settings, enum transcode_profile profile, struct decode_ctx *src_ctx, struct media_quality *quality)
{
int src_bytes_per_sample = av_get_bytes_per_sample(src_ctx->audio_stream.codec->sample_fmt);
// Initialize unset settings that are source-dependent, not profile-dependent
if (!settings->sample_rate)
settings->sample_rate = src_ctx->audio_stream.codec->sample_rate;
#if USE_CH_LAYOUT
if (!av_channel_layout_check(&settings->channel_layout))
av_channel_layout_copy(&settings->channel_layout, &src_ctx->audio_stream.codec->ch_layout);
settings->nb_channels = settings->channel_layout.nb_channels;
#else
if (settings->nb_channels == 0)
{
settings->nb_channels = src_ctx->audio_stream.codec->channels;
settings->channel_layout = src_ctx->audio_stream.codec->channel_layout;
}
#endif
// Initialize settings that are both source-dependent and profile-dependent
switch (profile)
{
case XCODE_MP4_ALAC:
case XCODE_MP4_ALAC_HEADER:
if (!settings->sample_format)
settings->sample_format = (src_bytes_per_sample == 4) ? AV_SAMPLE_FMT_S32P : AV_SAMPLE_FMT_S16P;
break;
case XCODE_PCM_NATIVE:
if (!settings->sample_format)
settings->sample_format = (src_bytes_per_sample == 4) ? AV_SAMPLE_FMT_S32 : AV_SAMPLE_FMT_S16;
if (!settings->audio_codec)
settings->audio_codec = (src_bytes_per_sample == 4) ? AV_CODEC_ID_PCM_S32LE : AV_CODEC_ID_PCM_S16LE;
if (!settings->format)
settings->format = (src_bytes_per_sample == 4) ? "s32le" : "s16le";
break;
default:
if (settings->sample_format && settings->audio_codec && settings->format)
return 0;
DPRINTF(E_LOG, L_XCODE, "Bug! Profile %d has unset encoding parameters\n", profile);
return -1;
}
return 0;
}
static void
stream_settings_set(struct stream_ctx *s, struct settings_ctx *settings, enum AVMediaType type)
{
if (type == AVMEDIA_TYPE_AUDIO)
{
s->codec->sample_rate = settings->sample_rate;
#if USE_CH_LAYOUT
av_channel_layout_copy(&s->codec->ch_layout, &(settings->channel_layout));
#else
s->codec->channel_layout = settings->channel_layout;
s->codec->channels = settings->nb_channels;
#endif
s->codec->sample_fmt = settings->sample_format;
s->codec->time_base = (AVRational){1, settings->sample_rate};
s->codec->bit_rate = settings->bit_rate;
}
else if (type == AVMEDIA_TYPE_VIDEO)
{
s->codec->height = settings->height;
s->codec->width = settings->width;
s->codec->pix_fmt = settings->pix_fmt;
s->codec->time_base = (AVRational){1, 25};
}
}
/* -------------------------------- HELPERS -------------------------------- */
static enum AVSampleFormat
bitdepth2format(int bits_per_sample)
{
if (bits_per_sample == 16)
return AV_SAMPLE_FMT_S16;
else if (bits_per_sample == 24)
return AV_SAMPLE_FMT_S32;
else if (bits_per_sample == 32)
return AV_SAMPLE_FMT_S32;
else
return AV_SAMPLE_FMT_NONE;
}
static inline char *
err2str(int errnum)
{
av_strerror(errnum, errbuf, sizeof(errbuf));
return errbuf;
}
static inline void
add_le16(uint8_t *dst, uint16_t val)
{
dst[0] = val & 0xff;
dst[1] = (val >> 8) & 0xff;
}
static inline void
add_le32(uint8_t *dst, uint32_t val)
{
dst[0] = val & 0xff;
dst[1] = (val >> 8) & 0xff;
dst[2] = (val >> 16) & 0xff;
dst[3] = (val >> 24) & 0xff;
}
// Copies the src buffer to position pos of the dst buffer, expanding dst if
// needed to fit src. Can be called with *dst = NULL and *dst_len = 0. Returns
// the number of bytes dst was expanded with.
static int
copy_buffer_to_position(uint8_t **dst, size_t *dst_len, uint8_t *src, size_t src_len, int64_t pos)
{
int bytes_added = 0;
if (pos < 0 || pos > *dst_len)
return -1; // Out of bounds
if (src_len == 0)
return 0; // Nothing to do
if (pos + src_len > *dst_len)
{
bytes_added = pos + src_len - *dst_len;
*dst_len += bytes_added;
CHECK_NULL(L_XCODE, *dst = realloc(*dst, *dst_len));
}
memcpy(*dst + pos, src, src_len);
return bytes_added;
}
// Doesn't actually seek, just inserts a marker in the obuf
static int64_t
dummy_seek(void *arg, int64_t offset, enum transcode_seek_type type)
{
struct transcode_ctx *ctx = arg;
struct encode_ctx *enc_ctx = ctx->encode_ctx;
if (type == XCODE_SEEK_SET)
{
evbuffer_add(enc_ctx->obuf, xcode_seek_marker, sizeof(xcode_seek_marker));
evbuffer_add(enc_ctx->obuf, &offset, sizeof(offset));
return offset;
}
else if (type == XCODE_SEEK_SIZE)
return enc_ctx->bytes_total;
return -1;
}
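// For each XCODE_SEEK_SET, the obuf ends up containing (illustrative):
//
//   ...encoded data... | 8-byte xcode_seek_marker | 8-byte int64_t offset | ...
//
// The offset is written as raw memory (host byte order), which is fine because
// mp4_header_trailer_from_evbuf() below reads it back with memcpy() in the
// same process.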
static off_t
size_estimate(enum transcode_profile profile, uint32_t bit_rate, uint32_t sample_rate, uint16_t bytes_per_sample, uint16_t channels, uint32_t len_ms)
{
off_t bytes = 0;
uint64_t nsamples;
if (len_ms == 0)
len_ms = 3 * 60 * 1000;
nsamples = (uint64_t)sample_rate * (uint64_t)len_ms / 1000 + 1; // The +1 is to round up
if (profile == XCODE_WAV)
bytes = nsamples * channels * bytes_per_sample + WAV_HEADER_LEN;
else if (profile == XCODE_MP3)
bytes = (uint64_t)len_ms * (uint64_t)bit_rate / 8000;
else if (profile == XCODE_MP4_ALAC)
bytes = nsamples * channels * bytes_per_sample / 2; // FIXME
return bytes;
}
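// Worked example (illustrative): a 3 minute (180000 ms) track, 44100 Hz,
// 16 bit (2 bytes per sample), 2 channels, XCODE_WAV profile:
//
//   nsamples = 44100 * 180000 / 1000 + 1 = 7938001
//   bytes    = 7938001 * 2 * 2 + 44      = 31752048
//
// Note that the data size (bytes - WAV_HEADER_LEN) is always a multiple of
// channels * bytes_per_sample (here 4), which the SoundBridge M1001 appears
// to require (see issue #1770).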
/*
* Checks if this stream index is one that we are decoding
*
* @in ctx Decode context
* @in stream_index Index of stream to check
* @return Type of stream, unknown if we are not decoding the stream
*/
static enum AVMediaType
stream_find(struct decode_ctx *ctx, unsigned int stream_index)
{
if (ctx->audio_stream.stream && (stream_index == ctx->audio_stream.stream->index))
return AVMEDIA_TYPE_AUDIO;
if (ctx->video_stream.stream && (stream_index == ctx->video_stream.stream->index))
return AVMEDIA_TYPE_VIDEO;
return AVMEDIA_TYPE_UNKNOWN;
}
/*
* Adds a stream to an output
*
* @in ctx Encode context with the output to add the stream to
* @out s A pre-allocated stream ctx where we save stream and codec info
* @in codec_id What kind of codec should we use
* @return Negative on failure, otherwise zero
*/
static int
stream_add(struct encode_ctx *ctx, struct stream_ctx *s, enum AVCodecID codec_id)
{
const AVCodecDescriptor *codec_desc;
#if USE_CONST_AVCODEC
const AVCodec *encoder;
#else
// Not const before ffmpeg 5.0
AVCodec *encoder;
#endif
AVDictionary *options = NULL;
int ret;
codec_desc = avcodec_descriptor_get(codec_id);
if (!codec_desc)
{
DPRINTF(E_LOG, L_XCODE, "Invalid codec ID (%d)\n", codec_id);
return -1;
}
encoder = avcodec_find_encoder(codec_id);
if (!encoder)
{
DPRINTF(E_LOG, L_XCODE, "Necessary encoder (%s) not found\n", codec_desc->name);
return -1;
}
DPRINTF(E_DBG, L_XCODE, "Selected encoder '%s'\n", encoder->long_name);
CHECK_NULL(L_XCODE, s->stream = avformat_new_stream(ctx->ofmt_ctx, NULL));
CHECK_NULL(L_XCODE, s->codec = avcodec_alloc_context3(encoder));
stream_settings_set(s, &ctx->settings, encoder->type);
if (!s->codec->pix_fmt)
{
s->codec->pix_fmt = avcodec_default_get_format(s->codec, encoder->pix_fmts);
DPRINTF(E_DBG, L_XCODE, "Pixel format set to %s (encoder is %s)\n", av_get_pix_fmt_name(s->codec->pix_fmt), codec_desc->name);
}
if (ctx->ofmt_ctx->oformat->flags & AVFMT_GLOBALHEADER)
s->codec->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
// With ffmpeg 3.4, jpeg encoding with optimal huffman tables will segfault, see issue #502
if (codec_id == AV_CODEC_ID_MJPEG)
av_dict_set(&options, "huffman", "default", 0);
// 20 ms frames is the current ffmpeg default, but we set it anyway, so that
// we don't risk issues if future versions change the default (it would become
// an issue because outputs/cast.c relies on 20 ms frames)
if (codec_id == AV_CODEC_ID_OPUS)
av_dict_set(&options, "frame_duration", "20", 0);
ret = avcodec_open2(s->codec, NULL, &options);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Cannot open encoder (%s): %s\n", codec_desc->name, err2str(ret));
goto error;
}
// airplay.c "misuses" the ffmpeg alac encoder in that it pushes frames with
// 352 samples even though the encoder wants 4096 (and doesn't have variable
// frame capability). This worked with no issues until ffmpeg 6, where it
// seems a frame size check was added. The below circumvents the check, but is
// dirty because we shouldn't be writing to this data element.
if (ctx->settings.frame_size)
s->codec->frame_size = ctx->settings.frame_size;
// Copy the codec parameters we just set to the stream, so the muxer knows them
ret = avcodec_parameters_from_context(s->stream->codecpar, s->codec);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Cannot copy stream parameters (%s): %s\n", codec_desc->name, err2str(ret));
goto error;
}
if (options)
{
DPRINTF(E_WARN, L_XCODE, "Encoder %s didn't recognize all options given to avcodec_open2\n", codec_desc->name);
av_dict_free(&options);
}
return 0;
error:
if (s->codec)
avcodec_free_context(&s->codec);
if (options)
av_dict_free(&options);
return -1;
}
/*
* Called by libavformat while demuxing. Used to interrupt/unblock av_read_frame
* in case a source (especially a network stream) becomes unavailable.
*
* @in arg Will point to the decode context
* @return Non-zero if av_read_frame should be interrupted
*/
static int
decode_interrupt_cb(void *arg)
{
struct decode_ctx *ctx;
ctx = (struct decode_ctx *)arg;
if (av_gettime() - ctx->timestamp > READ_TIMEOUT)
{
DPRINTF(E_LOG, L_XCODE, "Timeout while reading source (connection problem?)\n");
return 1;
}
return 0;
}
/* Will read the next packet from the source, unless we are resuming after a
* seek in which case the most recent packet found by transcode_seek() will be
* returned. The packet will be put in ctx->packet.
*
* @out type Media type of packet
* @in ctx Decode context
* @return 0 if OK, < 0 on error or end of file
*/
static int
read_packet(enum AVMediaType *type, struct decode_ctx *dec_ctx)
{
int ret;
// We just seeked, so transcode_seek() will have found a new ctx->packet and
// we should just start with that (if the stream is one we are ok with)
if (dec_ctx->resume)
{
dec_ctx->resume = 0;
*type = stream_find(dec_ctx, dec_ctx->packet->stream_index);
if (*type != AVMEDIA_TYPE_UNKNOWN)
return 0;
}
do
{
dec_ctx->timestamp = av_gettime();
av_packet_unref(dec_ctx->packet);
ret = av_read_frame(dec_ctx->ifmt_ctx, dec_ctx->packet);
if (ret < 0)
{
DPRINTF(E_WARN, L_XCODE, "Could not read frame: %s\n", err2str(ret));
return ret;
}
*type = stream_find(dec_ctx, dec_ctx->packet->stream_index);
}
while (*type == AVMEDIA_TYPE_UNKNOWN);
return 0;
}
// Prepares a packet from the encoder for muxing
static void
packet_prepare(AVPacket *pkt, struct stream_ctx *s)
{
pkt->stream_index = s->stream->index;
// This "wonderful" peace of code makes sure that the timestamp always increases,
// even if the user seeked backwards. The muxer will not accept non-increasing
// timestamps.
pkt->pts += s->offset_pts;
if (pkt->pts < s->prev_pts)
{
s->offset_pts += s->prev_pts - pkt->pts;
pkt->pts = s->prev_pts;
}
s->prev_pts = pkt->pts;
pkt->dts = pkt->pts; //FIXME
av_packet_rescale_ts(pkt, s->codec->time_base, s->stream->time_base);
}
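// Worked example (illustrative): say the previous packet had pts 1000 and the
// user then seeks backwards, so the next packet arrives with pts 400:
//
//   pts += offset_pts (0)   -> 400
//   400 < prev_pts (1000)   -> offset_pts += 600, pts = 1000
//
// Every following packet also gets +600 added, so the muxer sees a
// monotonically increasing timeline even though the source position jumped.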
/*
* Part 4+5 of the conversion chain: read -> decode -> filter -> encode -> write
*
*/
static int
encode_write(struct encode_ctx *ctx, struct stream_ctx *s, AVFrame *filt_frame)
{
int ret;
// If filt_frame is null then flushing will be initiated by the codec
ret = avcodec_send_frame(s->codec, filt_frame);
if (ret < 0)
return ret;
while (1)
{
ret = avcodec_receive_packet(s->codec, ctx->encoded_pkt);
if (ret < 0)
{
if (ret == AVERROR(EAGAIN))
ret = 0;
break;
}
packet_prepare(ctx->encoded_pkt, s);
ret = av_interleaved_write_frame(ctx->ofmt_ctx, ctx->encoded_pkt);
if (ret < 0)
{
DPRINTF(E_WARN, L_XCODE, "av_interleaved_write_frame() failed: %s\n", err2str(ret));
break;
}
}
return ret;
}
/*
* Part 3 of the conversion chain: read -> decode -> filter -> encode -> write
*
* transcode_encode() starts here since the caller already has a frame
*
*/
static int
filter_encode_write(struct encode_ctx *ctx, struct stream_ctx *s, AVFrame *frame)
{
int ret;
// Push the decoded frame into the filtergraph
if (frame)
{
ret = av_buffersrc_add_frame(s->buffersrc_ctx, frame);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error while feeding the filtergraph: %s\n", err2str(ret));
return -1;
}
}
// Pull filtered frames from the filtergraph and pass to encoder
while (1)
{
ret = av_buffersink_get_frame(s->buffersink_ctx, ctx->filt_frame);
if (ret < 0)
{
if (!frame) // We are flushing
ret = encode_write(ctx, s, NULL);
else if (ret == AVERROR(EAGAIN))
ret = 0;
break;
}
ret = encode_write(ctx, s, ctx->filt_frame);
av_frame_unref(ctx->filt_frame);
if (ret < 0)
break;
}
return ret;
}
/*
* Part 2 of the conversion chain: read -> decode -> filter -> encode -> write
*
* If there is no encode_ctx the chain will be aborted here
*
*/
static int
decode_filter_encode_write(struct transcode_ctx *ctx, struct stream_ctx *s, AVPacket *pkt, enum AVMediaType type)
{
struct decode_ctx *dec_ctx = ctx->decode_ctx;
struct stream_ctx *out_stream = NULL;
int ret;
ret = avcodec_send_packet(s->codec, pkt);
if (ret < 0 && (ret != AVERROR_INVALIDDATA) && (ret != AVERROR(EAGAIN))) // We don't bail on invalid data, some streams work anyway
{
DPRINTF(E_LOG, L_XCODE, "Decoder error, avcodec_send_packet said '%s' (%d)\n", err2str(ret), ret);
return ret;
}
if (ctx->encode_ctx)
{
if (type == AVMEDIA_TYPE_AUDIO)
out_stream = &ctx->encode_ctx->audio_stream;
else if (type == AVMEDIA_TYPE_VIDEO)
out_stream = &ctx->encode_ctx->video_stream;
else
return -1;
}
while (1)
{
ret = avcodec_receive_frame(s->codec, dec_ctx->decoded_frame);
if (ret < 0)
{
if (ret == AVERROR(EAGAIN))
ret = 0;
else if (out_stream)
ret = filter_encode_write(ctx->encode_ctx, out_stream, NULL); // Flush
break;
}
dec_ctx->got_frame = 1;
if (!out_stream)
break;
ret = filter_encode_write(ctx->encode_ctx, out_stream, dec_ctx->decoded_frame);
if (ret < 0)
break;
}
return ret;
}
/*
* Part 1 of the conversion chain: read -> decode -> filter -> encode -> write
*
* Will read exactly one packet from the input and put it in the chain. You
* cannot count on anything coming out of the other end from just one packet,
* so you probably should loop when calling this and check the contents of
* enc_ctx->obuf.
*
*/
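// For example (illustrative only), a caller can drive the chain the same way
// make_mp4_header() below does:
//
//   while (read_decode_filter_encode_write(&ctx) == 0)
//     {
//       // consume/drain ctx.encode_ctx->obuf here
//     }
//
// A negative return value signals an error or AVERROR_EOF.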
static int
read_decode_filter_encode_write(struct transcode_ctx *ctx)
{
struct decode_ctx *dec_ctx = ctx->decode_ctx;
struct encode_ctx *enc_ctx = ctx->encode_ctx;
enum AVMediaType type;
int ret;
ret = read_packet(&type, dec_ctx);
if (ret < 0)
{
if (ret == AVERROR_EOF)
dec_ctx->eof = 1;
if (dec_ctx->audio_stream.stream)
decode_filter_encode_write(ctx, &dec_ctx->audio_stream, NULL, AVMEDIA_TYPE_AUDIO);
if (dec_ctx->video_stream.stream)
decode_filter_encode_write(ctx, &dec_ctx->video_stream, NULL, AVMEDIA_TYPE_VIDEO);
if (enc_ctx)
av_interleaved_write_frame(enc_ctx->ofmt_ctx, NULL); // Flush muxer
if (enc_ctx && !enc_ctx->settings.without_libav_trailer)
av_write_trailer(enc_ctx->ofmt_ctx);
return ret;
}
if (type == AVMEDIA_TYPE_AUDIO)
ret = decode_filter_encode_write(ctx, &dec_ctx->audio_stream, dec_ctx->packet, type);
else if (type == AVMEDIA_TYPE_VIDEO)
ret = decode_filter_encode_write(ctx, &dec_ctx->video_stream, dec_ctx->packet, type);
return ret;
}
/* ------------------------------- CUSTOM I/O ------------------------------ */
/* For using ffmpeg with evbuffer input/output instead of files */
static int
avio_evbuffer_read(void *opaque, uint8_t *buf, int size)
{
struct avio_evbuffer *ae = (struct avio_evbuffer *)opaque;
int ret;
ret = evbuffer_remove(ae->evbuf, buf, size);
// Must return AVERROR, see avio.h: avio_alloc_context()
return (ret > 0) ? ret : AVERROR_EOF;
}
#if USE_CONST_AVIO_WRITE_PACKET
static int
avio_evbuffer_write(void *opaque, const uint8_t *buf, int size)
#else
static int
avio_evbuffer_write(void *opaque, uint8_t *buf, int size)
#endif
{
struct avio_evbuffer *ae = (struct avio_evbuffer *)opaque;
int ret;
ret = evbuffer_add(ae->evbuf, buf, size);
return (ret == 0) ? size : -1;
}
static int64_t
avio_evbuffer_seek(void *opaque, int64_t offset, int whence)
{
struct avio_evbuffer *ae = (struct avio_evbuffer *)opaque;
enum transcode_seek_type seek_type;
// Caller shouldn't need to know about ffmpeg defines
if (whence & AVSEEK_SIZE)
seek_type = XCODE_SEEK_SIZE;
else if (whence == SEEK_SET)
seek_type = XCODE_SEEK_SET;
else if (whence == SEEK_CUR)
seek_type = XCODE_SEEK_CUR;
else
return -1;
return ae->seekfn(ae->seekfn_arg, offset, seek_type);
}
static AVIOContext *
avio_evbuffer_open(struct transcode_evbuf_io *evbuf_io, int is_output)
{
struct avio_evbuffer *ae;
AVIOContext *s;
ae = calloc(1, sizeof(struct avio_evbuffer));
if (!ae)
{
DPRINTF(E_LOG, L_FFMPEG, "Out of memory for avio_evbuffer\n");
return NULL;
}
ae->buffer = av_mallocz(AVIO_BUFFER_SIZE);
if (!ae->buffer)
{
DPRINTF(E_LOG, L_FFMPEG, "Out of memory for avio buffer\n");
free(ae);
return NULL;
}
ae->evbuf = evbuf_io->evbuf;
ae->seekfn = evbuf_io->seekfn;
ae->seekfn_arg = evbuf_io->seekfn_arg;
if (is_output)
s = avio_alloc_context(ae->buffer, AVIO_BUFFER_SIZE, 1, ae, NULL, avio_evbuffer_write, (evbuf_io->seekfn ? avio_evbuffer_seek : NULL));
else
s = avio_alloc_context(ae->buffer, AVIO_BUFFER_SIZE, 0, ae, avio_evbuffer_read, NULL, (evbuf_io->seekfn ? avio_evbuffer_seek : NULL));
if (!s)
{
DPRINTF(E_LOG, L_FFMPEG, "Could not allocate AVIOContext\n");
av_free(ae->buffer);
free(ae);
return NULL;
}
s->seekable = (evbuf_io->seekfn ? AVIO_SEEKABLE_NORMAL : 0);
return s;
}
static void
avio_evbuffer_close(AVIOContext *s)
{
struct avio_evbuffer *ae;
if (!s)
return;
ae = (struct avio_evbuffer *)s->opaque;
avio_flush(s);
av_free(s->buffer);
free(ae);
av_free(s);
}
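// A minimal usage sketch of the evbuffer AVIO glue above (illustrative, not
// part of the build; "input_buf", "my_seek" and "my_arg" are made-up names):
//
//   struct transcode_evbuf_io io = { .evbuf = input_buf, .seekfn = my_seek, .seekfn_arg = my_arg };
//   AVIOContext *avio = avio_evbuffer_open(&io, 0); // 0 = read side, 1 = write side
//   fmt_ctx->pb = avio;                             // hand it to libavformat
//   ...
//   avio_evbuffer_close(avio);
//
// This is what open_input() and open_output() below do for evbuffer-based
// input and for all output.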
/* ----------------------- CUSTOM HEADER GENERATION ------------------------ */
static int
make_wav_header(struct evbuffer **wav_header, uint32_t sample_rate, uint16_t bytes_per_sample, uint16_t channels, uint32_t bytes_total)
{
uint8_t header[WAV_HEADER_LEN];
memcpy(header, "RIFF", 4);
add_le32(header + 4, bytes_total - 8); // Total file size - 8 bytes as defined by the format
memcpy(header + 8, "WAVEfmt ", 8);
add_le32(header + 16, 16);
add_le16(header + 20, 1); // AudioFormat (PCM)
add_le16(header + 22, channels); /* channels */
add_le32(header + 24, sample_rate); /* samplerate */
add_le32(header + 28, sample_rate * channels * bytes_per_sample); /* byte rate */
add_le16(header + 32, channels * bytes_per_sample); /* block align */
add_le16(header + 34, 8 * bytes_per_sample); /* bits per sample */
memcpy(header + 36, "data", 4);
add_le32(header + 40, bytes_total - WAV_HEADER_LEN);
*wav_header = evbuffer_new();
evbuffer_add(*wav_header, header, sizeof(header));
return 0;
}
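// Byte layout produced by make_wav_header() above (all values little-endian):
//
//   offset 0   "RIFF"                   offset 22  channels
//   offset 4   total file size - 8      offset 24  sample rate
//   offset 8   "WAVE"                   offset 28  byte rate (sample rate * block align)
//   offset 12  "fmt "                   offset 32  block align (channels * bytes per sample)
//   offset 16  fmt chunk size (16)      offset 34  bits per sample
//   offset 20  audio format (1 = PCM)   offset 36  "data"
//                                       offset 40  data size (total file size - 44)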
static int
mp4_adjust_moov_stco_offset(uint8_t *moov, size_t moov_len)
{
uint8_t stco_needle[8] = { 's', 't', 'c', 'o', 0, 0, 0, 0 };
uint32_t be32;
uint32_t n_entries;
uint32_t entry;
uint8_t *ptr;
uint8_t *end;
end = moov + moov_len;
ptr = memmem(moov, moov_len, stco_needle, sizeof(stco_needle));
if (!ptr || ptr + sizeof(stco_needle) + sizeof(be32) > end)
return -1;
ptr += sizeof(stco_needle);
memcpy(&be32, ptr, sizeof(be32));
for (n_entries = be32toh(be32); n_entries > 0; n_entries--)
{
ptr += sizeof(be32);
if (ptr + sizeof(be32) > end)
return -1;
memcpy(&be32, ptr, sizeof(be32));
entry = be32toh(be32);
be32 = htobe32(entry + moov_len);
memcpy(ptr, &be32, sizeof(be32));
}
return 0;
}
static int
mp4_header_trailer_from_evbuf(uint8_t **header, size_t *header_len, uint8_t **trailer, size_t *trailer_len, struct evbuffer *evbuf, int64_t start_pos)
{
uint8_t *buf = evbuffer_pullup(evbuf, -1);
size_t buf_len = evbuffer_get_length(evbuf);
int64_t pos = start_pos;
int bytes_added = 0;
uint8_t *marker;
size_t len;
int ret;
while (buf_len > 0)
{
marker = memmem(buf, buf_len, xcode_seek_marker, sizeof(xcode_seek_marker));
len = marker ? marker - buf : buf_len;
if (pos <= *header_len) // Either first write of header or seek to pos inside header
ret = copy_buffer_to_position(header, header_len, buf, len, pos);
else if (pos >= start_pos) // Either first write of trailer or seek to pos inside trailer
ret = copy_buffer_to_position(trailer, trailer_len, buf, len, pos - start_pos);
else // Unexpected seek to body (pos is before trailer but not in header)
ret = -1;
if (ret < 0)
return -1;
bytes_added += ret;
if (!marker)
break;
memcpy(&pos, marker + sizeof(xcode_seek_marker), sizeof(pos));
buf += len + sizeof(xcode_seek_marker) + sizeof(pos);
buf_len -= len + sizeof(xcode_seek_marker) + sizeof(pos);
}
evbuffer_drain(evbuf, -1);
return bytes_added;
}
// Transcodes the entire file so that we can grab the header, which will then
// have a correct moov atom. The moov atom contains elements like stco and stsz
// which can only be made when the encoding has been done, since they contain
// information about where the frames are in the file. iTunes and SoundBridge
// require these to be correct, otherwise they won't play our transcoded files.
// They also require that the atom is at the beginning of the file. ffmpeg's
// "faststart" option does this, but it is difficult to use with non-file
// output, so instead we move the atom ourselves.
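// Illustration (sizes not to scale) of what this produces:
//
//   normal ffmpeg output:    [ftyp][free][mdat ...data...][moov]
//   header returned here:    [ftyp][moov][free][mdat box header, size patched]
//
// The moov (the "trailer" buffer below) is inserted just before the free atom,
// mirroring what "faststart" would do, and its stco chunk offsets are shifted
// by the moov size via mp4_adjust_moov_stco_offset().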
static int
make_mp4_header(struct evbuffer **mp4_header, const char *url)
{
struct transcode_decode_setup_args decode_args = { .profile = XCODE_MP4_ALAC_HEADER };
struct transcode_encode_setup_args encode_args = { .profile = XCODE_MP4_ALAC_HEADER };
struct transcode_ctx ctx = { 0 };
struct transcode_evbuf_io evbuf_io = { 0 };
uint8_t free_tag[4] = { 'f', 'r', 'e', 'e' };
uint8_t *header = NULL;
uint8_t *trailer = NULL;
size_t header_len = 0;
size_t trailer_len = 0;
uint8_t *ptr;
int ret;
if (!url || *url != '/')
return -1;
CHECK_NULL(L_XCODE, evbuf_io.evbuf = evbuffer_new());
evbuf_io.seekfn = dummy_seek;
evbuf_io.seekfn_arg = &ctx;
decode_args.path = url;
ctx.decode_ctx = transcode_decode_setup(decode_args);
if (!ctx.decode_ctx)
goto error;
encode_args.evbuf_io = &evbuf_io;
encode_args.src_ctx = ctx.decode_ctx;
ctx.encode_ctx = transcode_encode_setup(encode_args);
if (!ctx.encode_ctx)
goto error;
// Save the template header, which looks something like this (note that the
// mdat size is still unknown, so just zeroes, and there is no moov):
//
// 0000 00 00 00 1c 66 74 79 70 69 73 6f 6d 00 00 02 00 ....ftypisom....
// 0010 69 73 6f 6d 69 73 6f 32 6d 70 34 31 00 00 00 08 isomiso2mp41....
// 0020 66 72 65 65 00 00 00 00 6d 64 61 74 free....mdat
ret = avformat_write_header(ctx.encode_ctx->ofmt_ctx, NULL);
if (ret < 0)
goto error;
// Writes the obuf to the header buffer, bytes_processed is 0
ret = mp4_header_trailer_from_evbuf(&header, &header_len, &trailer, &trailer_len, ctx.encode_ctx->obuf, ctx.encode_ctx->bytes_processed);
if (ret < 0)
goto error;
ctx.encode_ctx->bytes_processed += ret;
// Encode but discard result, this is just so that ffmpeg can create the
// missing header data.
while (read_decode_filter_encode_write(&ctx) == 0)
{
ctx.encode_ctx->bytes_processed += evbuffer_get_length(ctx.encode_ctx->obuf);
evbuffer_drain(ctx.encode_ctx->obuf, -1);
}
// Here, ffmpeg will seek back and write the size to the mdat atom and then
// seek forward again to write the trailer. Since we can't actually seek, we
// instead look for the markers that dummy_seek() inserted.
av_write_trailer(ctx.encode_ctx->ofmt_ctx);
ret = mp4_header_trailer_from_evbuf(&header, &header_len, &trailer, &trailer_len, ctx.encode_ctx->obuf, ctx.encode_ctx->bytes_processed);
if (ret < 0 || !header || !trailer)
goto error;
// The trailer buffer should now contain the moov atom. We need to adjust the
// chunk offset (stco) in it because we will move it to the beginning of the
// file.
ret = mp4_adjust_moov_stco_offset(trailer, trailer_len);
if (ret < 0)
goto error;
// Now we want to move the trailer (which has the moov atom) into the header.
// We insert it before the free atom, because that's what ffmpeg does when
// the "faststart" option is set.
CHECK_NULL(L_XCODE, header = realloc(header, header_len + trailer_len));
ptr = memmem(header, header_len, free_tag, sizeof(free_tag));
if (!ptr || ptr - header < sizeof(uint32_t))
goto error;
ptr -= sizeof(uint32_t);
memmove(ptr + trailer_len, ptr, header + header_len - ptr);
memcpy(ptr, trailer, trailer_len);
header_len += trailer_len;
*mp4_header = evbuffer_new();
evbuffer_add(*mp4_header, header, header_len);
free(header);
free(trailer);
transcode_decode_cleanup(&ctx.decode_ctx);
transcode_encode_cleanup(&ctx.encode_ctx);
evbuffer_free(evbuf_io.evbuf);
return 0;
error:
if (header)
DHEXDUMP(E_DBG, L_XCODE, header, header_len, "MP4 header\n");
if (trailer)
DHEXDUMP(E_DBG, L_XCODE, trailer, trailer_len, "MP4 trailer\n");
free(header);
free(trailer);
transcode_decode_cleanup(&ctx.decode_ctx);
transcode_encode_cleanup(&ctx.encode_ctx);
evbuffer_free(evbuf_io.evbuf);
return -1;
}
/* --------------------------- INPUT/OUTPUT INIT --------------------------- */
static int
open_decoder(AVCodecContext **dec_ctx, unsigned int *stream_index, struct decode_ctx *ctx, enum AVMediaType type)
{
#if USE_CONST_AVCODEC
const AVCodec *decoder;
#else
// Not const before ffmpeg 5.0
AVCodec *decoder;
#endif
int ret;
ret = av_find_best_stream(ctx->ifmt_ctx, type, -1, -1, &decoder, 0);
if (ret < 0)
{
if (!ctx->settings.silent)
DPRINTF(E_LOG, L_XCODE, "Error finding best stream: %s\n", err2str(ret));
return ret;
}
*stream_index = (unsigned int)ret;
CHECK_NULL(L_XCODE, *dec_ctx = avcodec_alloc_context3(decoder));
// Filter creation will need the sample rate and format that the decoder is
// giving us - however, the sample rate of dec_ctx will be 0 if we don't prime
// it with the stream's codecpar data.
ret = avcodec_parameters_to_context(*dec_ctx, ctx->ifmt_ctx->streams[*stream_index]->codecpar);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Failed to copy codecpar for stream #%d: %s\n", *stream_index, err2str(ret));
avcodec_free_context(dec_ctx);
return ret;
}
ret = avcodec_open2(*dec_ctx, NULL, NULL);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Failed to open decoder for stream #%d: %s\n", *stream_index, err2str(ret));
avcodec_free_context(dec_ctx);
return ret;
}
return 0;
}
static void
close_input(struct decode_ctx *ctx)
{
if (!ctx->ifmt_ctx)
return;
avio_evbuffer_close(ctx->avio);
avcodec_free_context(&ctx->audio_stream.codec);
avcodec_free_context(&ctx->video_stream.codec);
avformat_close_input(&ctx->ifmt_ctx);
ctx->ifmt_ctx = NULL;
}
static int
open_input(struct decode_ctx *ctx, const char *path, struct transcode_evbuf_io *evbuf_io, enum probe_type probe_type)
{
AVDictionary *options = NULL;
AVCodecContext *dec_ctx;
#if USE_CONST_AVFORMAT
const AVInputFormat *ifmt;
#else
// Not const before ffmpeg 5.0
AVInputFormat *ifmt;
#endif
unsigned int stream_index;
const char *user_agent;
int ret = 0;
CHECK_NULL(L_XCODE, ctx->ifmt_ctx = avformat_alloc_context());
// Caller can ask for small probe to start quicker + search for embedded
// artwork quicker. Especially useful for http sources. The standard probe
// size takes around 5 sec for an mp3, while the below only takes around a
// second. The improved performance comes at the cost of possible inaccuracy.
if (probe_type == PROBE_TYPE_QUICK)
{
ctx->ifmt_ctx->probesize = 65536;
ctx->ifmt_ctx->format_probesize = 65536;
}
if (ctx->is_http)
{
av_dict_set(&options, "icy", "1", 0);
user_agent = cfg_getstr(cfg_getsec(cfg, "general"), "user_agent");
av_dict_set(&options, "user_agent", user_agent, 0);
av_dict_set(&options, "reconnect", "1", 0);
// The below option disabled because it does not work with m3u8 streams,
// see https://lists.ffmpeg.org/pipermail/ffmpeg-user/2018-September/041109.html
// av_dict_set(&options, "reconnect_at_eof", "1", 0);
av_dict_set(&options, "reconnect_streamed", "1", 0);
}
// TODO Newest versions of ffmpeg have timeout and reconnect options we should use
ctx->ifmt_ctx->interrupt_callback.callback = decode_interrupt_cb;
ctx->ifmt_ctx->interrupt_callback.opaque = ctx;
ctx->timestamp = av_gettime();
if (evbuf_io)
{
ifmt = av_find_input_format(ctx->settings.in_format);
if (!ifmt)
{
DPRINTF(E_LOG, L_XCODE, "Could not find input format: '%s'\n", ctx->settings.in_format);
goto out_fail;
}
CHECK_NULL(L_XCODE, ctx->avio = avio_evbuffer_open(evbuf_io, 0));
ctx->ifmt_ctx->pb = ctx->avio;
ret = avformat_open_input(&ctx->ifmt_ctx, NULL, ifmt, &options);
}
else
{
ret = avformat_open_input(&ctx->ifmt_ctx, path, NULL, &options);
}
if (options)
av_dict_free(&options);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Cannot open '%s': %s\n", path, err2str(ret));
goto out_fail;
}
// If the source has REPLAYGAIN_TRACK_GAIN metadata, this will inject the
// values into the next packet's side data (as AV_FRAME_DATA_REPLAYGAIN),
// which has the effect that a volume replaygain filter works. Note that
// ffmpeg itself uses another method in process_input() in ffmpeg.c.
av_format_inject_global_side_data(ctx->ifmt_ctx);
ret = avformat_find_stream_info(ctx->ifmt_ctx, NULL);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Cannot find stream information: %s\n", err2str(ret));
goto out_fail;
}
if (ctx->ifmt_ctx->nb_streams > MAX_STREAMS)
{
DPRINTF(E_LOG, L_XCODE, "File '%s' has too many streams (%u)\n", path, ctx->ifmt_ctx->nb_streams);
goto out_fail;
}
if (ctx->settings.encode_audio)
{
ret = open_decoder(&dec_ctx, &stream_index, ctx, AVMEDIA_TYPE_AUDIO);
if (ret < 0)
goto out_fail;
ctx->audio_stream.codec = dec_ctx;
ctx->audio_stream.stream = ctx->ifmt_ctx->streams[stream_index];
}
if (ctx->settings.encode_video)
{
ret = open_decoder(&dec_ctx, &stream_index, ctx, AVMEDIA_TYPE_VIDEO);
if (ret < 0)
goto out_fail;
ctx->video_stream.codec = dec_ctx;
ctx->video_stream.stream = ctx->ifmt_ctx->streams[stream_index];
}
return 0;
out_fail:
close_input(ctx);
return (ret < 0 ? ret : -1); // If we got an error code from ffmpeg then return that
}
static void
close_output(struct encode_ctx *ctx)
{
if (!ctx->ofmt_ctx)
return;
avcodec_free_context(&ctx->audio_stream.codec);
avcodec_free_context(&ctx->video_stream.codec);
avio_evbuffer_close(ctx->ofmt_ctx->pb);
avformat_free_context(ctx->ofmt_ctx);
ctx->ofmt_ctx = NULL;
}
static int
open_output(struct encode_ctx *ctx, struct transcode_evbuf_io *evbuf_io, struct evbuffer *prepared_header, struct decode_ctx *src_ctx)
{
#if USE_CONST_AVFORMAT
const AVOutputFormat *oformat;
#else
// Not const before ffmpeg 5.0
AVOutputFormat *oformat;
#endif
AVDictionary *options = NULL;
struct evbuffer *header = NULL;
int ret;
oformat = av_guess_format(ctx->settings.format, NULL, NULL);
if (!oformat)
{
DPRINTF(E_LOG, L_XCODE, "ffmpeg/libav could not find the '%s' output format\n", ctx->settings.format);
return -1;
}
#if USE_NO_CLEAR_AVFMT_NOFILE
CHECK_ERRNO(L_XCODE, avformat_alloc_output_context2(&ctx->ofmt_ctx, oformat, NULL, NULL));
#else
// Clear AVFMT_NOFILE bit, it is not allowed as we will set our own AVIOContext.
// If this is not done with e.g. ffmpeg 3.4 then artwork rescaling will fail.
oformat->flags &= ~AVFMT_NOFILE;
CHECK_NULL(L_XCODE, ctx->ofmt_ctx = avformat_alloc_context());
ctx->ofmt_ctx->oformat = oformat;
#endif
CHECK_NULL(L_XCODE, ctx->ofmt_ctx->pb = avio_evbuffer_open(evbuf_io, 1));
ctx->obuf = evbuf_io->evbuf;
if (ctx->settings.encode_audio)
{
ret = stream_add(ctx, &ctx->audio_stream, ctx->settings.audio_codec);
if (ret < 0)
goto error;
}
if (ctx->settings.encode_video)
{
ret = stream_add(ctx, &ctx->video_stream, ctx->settings.video_codec);
if (ret < 0)
goto error;
}
ret = avformat_init_output(ctx->ofmt_ctx, &options);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error initializing output: %s\n", err2str(ret));
goto error;
}
else if (options)
{
DPRINTF(E_WARN, L_XCODE, "Didn't recognize all options given to avformat_init_output\n");
av_dict_free(&options);
goto error;
}
// For WAV output, both avformat_write_header() and the manual wav header are required
if (!ctx->settings.without_libav_header)
{
ret = avformat_write_header(ctx->ofmt_ctx, NULL);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error writing header to output buffer: %s\n", err2str(ret));
goto error;
}
}
if (ctx->settings.with_wav_header)
{
ret = make_wav_header(&header, ctx->settings.sample_rate, av_get_bytes_per_sample(ctx->settings.sample_format), ctx->settings.nb_channels, ctx->bytes_total);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error creating WAV header\n");
goto error;
}
evbuffer_add_buffer(ctx->obuf, header);
evbuffer_free(header);
}
if (ctx->settings.with_mp4_header && prepared_header)
{
evbuffer_add_buffer(ctx->obuf, prepared_header);
}
else if (ctx->settings.with_mp4_header)
{
ret = make_mp4_header(&header, src_ctx->ifmt_ctx->url);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error creating MP4 header\n");
goto error;
}
evbuffer_add_buffer(ctx->obuf, header);
evbuffer_free(header);
}
return 0;
error:
close_output(ctx);
return -1;
}
static int
filter_def_abuffer(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
#if USE_CH_LAYOUT
char buf[64];
// Some AIFF files only have a channel number, not a layout
if (in_stream->codec->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC)
av_channel_layout_default(&in_stream->codec->ch_layout, in_stream->codec->ch_layout.nb_channels);
av_channel_layout_describe(&in_stream->codec->ch_layout, buf, sizeof(buf));
snprintf(def->args, sizeof(def->args),
"time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=%s",
in_stream->stream->time_base.num, in_stream->stream->time_base.den,
in_stream->codec->sample_rate, av_get_sample_fmt_name(in_stream->codec->sample_fmt),
buf);
#else
if (!in_stream->codec->channel_layout)
in_stream->codec->channel_layout = av_get_default_channel_layout(in_stream->codec->channels);
snprintf(def->args, sizeof(def->args),
"time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%"PRIx64,
in_stream->stream->time_base.num, in_stream->stream->time_base.den,
in_stream->codec->sample_rate, av_get_sample_fmt_name(in_stream->codec->sample_fmt),
in_stream->codec->channel_layout);
#endif
snprintf(def->name, sizeof(def->name), "abuffer");
return 0;
}
static int
filter_def_aformat(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
#if USE_CH_LAYOUT
char buf[64];
if (out_stream->codec->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC)
av_channel_layout_default(&out_stream->codec->ch_layout, out_stream->codec->ch_layout.nb_channels);
av_channel_layout_describe(&out_stream->codec->ch_layout, buf, sizeof(buf));
snprintf(def->args, sizeof(def->args),
"sample_fmts=%s:sample_rates=%d:channel_layouts=%s",
av_get_sample_fmt_name(out_stream->codec->sample_fmt), out_stream->codec->sample_rate,
buf);
#else
// For some AIFF files, ffmpeg (3.4.6) will not give us a channel_layout (bug in ffmpeg?)
if (!out_stream->codec->channel_layout)
out_stream->codec->channel_layout = av_get_default_channel_layout(out_stream->codec->channels);
snprintf(def->args, sizeof(def->args),
"sample_fmts=%s:sample_rates=%d:channel_layouts=0x%"PRIx64,
av_get_sample_fmt_name(out_stream->codec->sample_fmt), out_stream->codec->sample_rate,
out_stream->codec->channel_layout);
#endif
snprintf(def->name, sizeof(def->name), "aformat");
return 0;
}
static int
filter_def_abuffersink(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
snprintf(def->name, sizeof(def->name), "abuffersink");
*def->args = '\0';
return 0;
}
static int
filter_def_buffer(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
snprintf(def->name, sizeof(def->name), "buffer");
snprintf(def->args, sizeof(def->args),
"width=%d:height=%d:pix_fmt=%s:time_base=%d/%d:sar=%d/%d",
in_stream->codec->width, in_stream->codec->height, av_get_pix_fmt_name(in_stream->codec->pix_fmt),
in_stream->stream->time_base.num, in_stream->stream->time_base.den,
in_stream->codec->sample_aspect_ratio.num, in_stream->codec->sample_aspect_ratio.den);
return 0;
}
static int
filter_def_format(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
snprintf(def->name, sizeof(def->name), "format");
snprintf(def->args, sizeof(def->args),
"pix_fmts=%s", av_get_pix_fmt_name(out_stream->codec->pix_fmt));
return 0;
}
static int
filter_def_scale(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
snprintf(def->name, sizeof(def->name), "scale");
snprintf(def->args, sizeof(def->args),
"w=%d:h=%d", out_stream->codec->width, out_stream->codec->height);
return 0;
}
static int
filter_def_buffersink(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
snprintf(def->name, sizeof(def->name), "buffersink");
*def->args = '\0';
return 0;
}
static int
filter_def_user(struct filter_def *def, struct stream_ctx *out_stream, struct stream_ctx *in_stream, const char *deffn_arg)
{
char *ptr;
snprintf(def->name, sizeof(def->name), "%s", deffn_arg);
ptr = strchr(def->name, '=');
if (ptr)
{
*ptr = '\0';
snprintf(def->args, sizeof(def->args), "%s", ptr + 1);
}
else
*def->args = '\0';
return 0;
}
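// The filtergraphs built below end up as (user filters are optional):
//
//   audio: abuffer -> [decode_audio_filters...] -> aformat -> abuffersink
//   video: buffer  -> [decode_video_filters...] -> format -> scale -> buffersink
//
// Each user filter entry from the config is an ffmpeg filter string, e.g.
// "volume=0.5"; filter_def_user() splits name and args at the first '='.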
static int
define_audio_filters(struct filters *filters, size_t filters_len, bool with_user_filters)
{
int num_user_filters;
int i;
num_user_filters = cfg_size(cfg_getsec(cfg, "library"), "decode_audio_filters");
if (filters_len < num_user_filters + 3)
{
DPRINTF(E_LOG, L_XCODE, "Too many audio filters configured (%d, max is %zu)\n", num_user_filters, filters_len - 3);
return -1;
}
filters[0].deffn = filter_def_abuffer;
for (i = 0; with_user_filters && i < num_user_filters; i++)
{
filters[1 + i].deffn = filter_def_user;
filters[1 + i].deffn_arg = cfg_getnstr(cfg_getsec(cfg, "library"), "decode_audio_filters", i);
}
filters[1 + i].deffn = filter_def_aformat;
filters[2 + i].deffn = filter_def_abuffersink;
return 0;
}
static int
define_video_filters(struct filters *filters, size_t filters_len, bool with_user_filters)
{
int num_user_filters;
int i;
num_user_filters = cfg_size(cfg_getsec(cfg, "library"), "decode_video_filters");
if (filters_len < num_user_filters + 3)
{
DPRINTF(E_LOG, L_XCODE, "Too many video filters configured (%d, max is %zu)\n", num_user_filters, filters_len - 3);
return -1;
}
filters[0].deffn = filter_def_buffer;
for (i = 0; with_user_filters && i < num_user_filters; i++)
{
filters[1 + i].deffn = filter_def_user;
filters[1 + i].deffn_arg = cfg_getnstr(cfg_getsec(cfg, "library"), "decode_video_filters", i);
}
filters[1 + i].deffn = filter_def_format;
filters[2 + i].deffn = filter_def_scale;
filters[3 + i].deffn = filter_def_buffersink;
return 0;
}
static int
add_filters(int *num_added, AVFilterGraph *filter_graph, struct filters *filters, size_t filters_len,
struct stream_ctx *out_stream, struct stream_ctx *in_stream)
{
const AVFilter *av_filter;
struct filter_def def;
int i;
int ret;
for (i = 0; i < filters_len && filters[i].deffn; i++)
{
ret = filters[i].deffn(&def, out_stream, in_stream, filters[i].deffn_arg);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error creating filter definition\n");
return -1;
}
av_filter = avfilter_get_by_name(def.name);
if (!av_filter)
{
DPRINTF(E_LOG, L_XCODE, "Could not find filter '%s'\n", def.name);
return -1;
}
ret = avfilter_graph_create_filter(&filters[i].av_ctx, av_filter, def.name, def.args, NULL, filter_graph);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error creating filter '%s': %s\n", def.name, err2str(ret));
return -1;
}
DPRINTF(E_DBG, L_XCODE, "Created '%s' filter: '%s'\n", def.name, def.args);
if (i == 0)
continue;
ret = avfilter_link(filters[i - 1].av_ctx, 0, filters[i].av_ctx, 0);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error connecting filters: %s\n", err2str(ret));
return -1;
}
}
*num_added = i;
return 0;
}
static int
create_filtergraph(struct stream_ctx *out_stream, struct filters *filters, size_t filters_len, struct stream_ctx *in_stream)
{
AVFilterGraph *filter_graph;
int ret;
int added;
CHECK_NULL(L_XCODE, filter_graph = avfilter_graph_alloc());
ret = add_filters(&added, filter_graph, filters, filters_len, out_stream, in_stream);
if (ret < 0)
{
goto out_fail;
}
ret = avfilter_graph_config(filter_graph, NULL);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Filter graph config failed: %s\n", err2str(ret));
goto out_fail;
}
out_stream->buffersrc_ctx = filters[0].av_ctx;
out_stream->buffersink_ctx = filters[added - 1].av_ctx;
out_stream->filter_graph = filter_graph;
return 0;
out_fail:
avfilter_graph_free(&filter_graph);
return -1;
}
static void
close_filters(struct encode_ctx *ctx)
{
avfilter_graph_free(&ctx->audio_stream.filter_graph);
avfilter_graph_free(&ctx->video_stream.filter_graph);
}
static int
open_filters(struct encode_ctx *ctx, struct decode_ctx *src_ctx)
{
struct filters filters[MAX_FILTERS] = { 0 };
int ret;
if (ctx->settings.encode_audio)
{
ret = define_audio_filters(filters, ARRAY_SIZE(filters), ctx->settings.with_user_filters);
if (ret < 0)
goto out_fail;
ret = create_filtergraph(&ctx->audio_stream, filters, ARRAY_SIZE(filters), &src_ctx->audio_stream);
if (ret < 0)
goto out_fail;
// Many audio encoders require a fixed frame size. This will ensure that
// the filt_frame from av_buffersink_get_frame has that size (except EOF).
if (! (ctx->audio_stream.codec->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE))
av_buffersink_set_frame_size(ctx->audio_stream.buffersink_ctx, ctx->audio_stream.codec->frame_size);
}
if (ctx->settings.encode_video)
{
ret = define_video_filters(filters, ARRAY_SIZE(filters), ctx->settings.with_user_filters);
if (ret < 0)
goto out_fail;
ret = create_filtergraph(&ctx->video_stream, filters, ARRAY_SIZE(filters), &src_ctx->video_stream);
if (ret < 0)
goto out_fail;
}
return 0;
out_fail:
close_filters(ctx);
return -1;
}
/* ----------------------------- TRANSCODE API ----------------------------- */
/* Setup */
struct decode_ctx *
transcode_decode_setup(struct transcode_decode_setup_args args)
{
struct decode_ctx *ctx;
int ret;
CHECK_NULL(L_XCODE, ctx = calloc(1, sizeof(struct decode_ctx)));
CHECK_NULL(L_XCODE, ctx->decoded_frame = av_frame_alloc());
CHECK_NULL(L_XCODE, ctx->packet = av_packet_alloc());
ctx->len_ms = args.len_ms;
ret = init_settings(&ctx->settings, args.profile, args.quality);
if (ret < 0)
goto fail_free;
if (args.is_http)
{
ctx->is_http = true;
ret = open_input(ctx, args.path, args.evbuf_io, PROBE_TYPE_QUICK);
// Retry with a default, slower probe size
if (ret == AVERROR_STREAM_NOT_FOUND)
ret = open_input(ctx, args.path, args.evbuf_io, PROBE_TYPE_DEFAULT);
}
else
ret = open_input(ctx, args.path, args.evbuf_io, PROBE_TYPE_DEFAULT);
if (ret < 0)
goto fail_free;
return ctx;
fail_free:
av_packet_free(&ctx->packet);
av_frame_free(&ctx->decoded_frame);
free(ctx);
return NULL;
}
struct encode_ctx *
transcode_encode_setup(struct transcode_encode_setup_args args)
{
struct encode_ctx *ctx;
int dst_bytes_per_sample;
CHECK_NULL(L_XCODE, ctx = calloc(1, sizeof(struct encode_ctx)));
CHECK_NULL(L_XCODE, ctx->filt_frame = av_frame_alloc());
CHECK_NULL(L_XCODE, ctx->encoded_pkt = av_packet_alloc());
CHECK_NULL(L_XCODE, ctx->evbuf_io.evbuf = evbuffer_new());
// Caller didn't specify one, so use our own
if (!args.evbuf_io)
args.evbuf_io = &ctx->evbuf_io;
// Initialize general settings
if (init_settings(&ctx->settings, args.profile, args.quality) < 0)
goto error;
if (ctx->settings.encode_audio && init_settings_from_audio(&ctx->settings, args.profile, args.src_ctx, args.quality) < 0)
goto error;
if (ctx->settings.encode_video && init_settings_from_video(&ctx->settings, args.profile, args.src_ctx, args.width, args.height) < 0)
goto error;
dst_bytes_per_sample = av_get_bytes_per_sample(ctx->settings.sample_format);
ctx->bytes_total = size_estimate(args.profile, ctx->settings.bit_rate, ctx->settings.sample_rate, dst_bytes_per_sample, ctx->settings.nb_channels, args.src_ctx->len_ms);
if (ctx->settings.with_icy && args.src_ctx->is_http)
ctx->icy_interval = METADATA_ICY_INTERVAL * ctx->settings.nb_channels * dst_bytes_per_sample * ctx->settings.sample_rate;
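// E.g. for 44100 Hz, 16 bit, stereo output: 5 * 2 * 2 * 44100 = 882000 bytes
// between ICY metadata checks, i.e. roughly every 5 seconds of audio.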
if (open_output(ctx, args.evbuf_io, args.prepared_header, args.src_ctx) < 0)
goto error;
if (open_filters(ctx, args.src_ctx) < 0)
goto error;
return ctx;
error:
transcode_encode_cleanup(&ctx);
return NULL;
}
struct transcode_ctx *
transcode_setup(struct transcode_decode_setup_args decode_args, struct transcode_encode_setup_args encode_args)
{
struct transcode_ctx *ctx;
CHECK_NULL(L_XCODE, ctx = calloc(1, sizeof(struct transcode_ctx)));
ctx->decode_ctx = transcode_decode_setup(decode_args);
if (!ctx->decode_ctx)
{
free(ctx);
return NULL;
}
encode_args.src_ctx = ctx->decode_ctx;
ctx->encode_ctx = transcode_encode_setup(encode_args);
if (!ctx->encode_ctx)
{
transcode_decode_cleanup(&ctx->decode_ctx);
free(ctx);
return NULL;
}
return ctx;
}
struct decode_ctx *
transcode_decode_setup_raw(enum transcode_profile profile, struct media_quality *quality)
{
const AVCodecDescriptor *codec_desc;
struct decode_ctx *ctx;
#if USE_CONST_AVCODEC
const AVCodec *decoder;
#else
// Not const before ffmpeg 5.0
AVCodec *decoder;
#endif
int ret;
CHECK_NULL(L_XCODE, ctx = calloc(1, sizeof(struct decode_ctx)));
if (init_settings(&ctx->settings, profile, quality) < 0)
{
goto out_free_ctx;
}
codec_desc = avcodec_descriptor_get(ctx->settings.audio_codec);
if (!codec_desc)
{
DPRINTF(E_LOG, L_XCODE, "Invalid codec ID (%d)\n", ctx->settings.audio_codec);
goto out_free_ctx;
}
// In raw mode we won't actually need to read or decode, but we still set up
// the decode_ctx because transcode_encode_setup() gets info about the input
// through this structure (TODO don't do that)
decoder = avcodec_find_decoder(ctx->settings.audio_codec);
if (!decoder)
{
DPRINTF(E_LOG, L_XCODE, "Could not find decoder for: %s\n", codec_desc->name);
goto out_free_ctx;
}
CHECK_NULL(L_XCODE, ctx->ifmt_ctx = avformat_alloc_context());
CHECK_NULL(L_XCODE, ctx->audio_stream.codec = avcodec_alloc_context3(decoder));
CHECK_NULL(L_XCODE, ctx->audio_stream.stream = avformat_new_stream(ctx->ifmt_ctx, NULL));
stream_settings_set(&ctx->audio_stream, &ctx->settings, decoder->type);
// Copy the data we just set to the structs we will be querying later, e.g. in open_filter
ctx->audio_stream.stream->time_base = ctx->audio_stream.codec->time_base;
ret = avcodec_parameters_from_context(ctx->audio_stream.stream->codecpar, ctx->audio_stream.codec);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Cannot copy stream parameters (%s): %s\n", codec_desc->name, err2str(ret));
goto out_free_codec;
}
return ctx;
out_free_codec:
avcodec_free_context(&ctx->audio_stream.codec);
avformat_free_context(ctx->ifmt_ctx);
out_free_ctx:
free(ctx);
return NULL;
}
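
// Decides whether the file must be transcoded for this client and, if so, to
// which profile. The decision is based on the no_decode/force_decode and
// prefer_format settings in the config plus the codecs the client advertises
// (or, failing that, a guess from the User-Agent).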
enum transcode_profile
transcode_needed(const char *user_agent, const char *client_codecs, const char *file_codectype)
{
const char *codectype;
const char *prefer_format;
cfg_t *lib;
bool force_xcode;
bool supports_alac;
bool supports_mpeg;
bool supports_wav;
int count;
int i;
if (!file_codectype)
{
return XCODE_UNKNOWN;
}
lib = cfg_getsec(cfg, "library");
count = cfg_size(lib, "no_decode");
for (i = 0; i < count; i++)
{
codectype = cfg_getnstr(lib, "no_decode", i);
if (strcmp(file_codectype, codectype) == 0)
return XCODE_NONE; // Codectype is in no_decode
}
count = cfg_size(lib, "force_decode");
for (i = 0, force_xcode = false; i < count && !force_xcode; i++)
{
codectype = cfg_getnstr(lib, "force_decode", i);
if (strcmp(file_codectype, codectype) == 0)
force_xcode = true; // Codectype is in force_decode
}
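  // The client didn't tell us what it supports, so make an educated guess from
  // the User-Agent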
if (!client_codecs && user_agent)
{
if (strncmp(user_agent, "iTunes", strlen("iTunes")) == 0)
client_codecs = itunes_codecs;
else if (strncmp(user_agent, "Music/", strlen("Music/")) == 0) // Apple Music, include slash because the name is generic
client_codecs = itunes_codecs;
else if (strncmp(user_agent, "QuickTime", strlen("QuickTime")) == 0)
client_codecs = itunes_codecs; // Use iTunes codecs
else if (strncmp(user_agent, "Front%20Row", strlen("Front%20Row")) == 0)
client_codecs = itunes_codecs; // Use iTunes codecs
else if (strncmp(user_agent, "AppleCoreMedia", strlen("AppleCoreMedia")) == 0)
client_codecs = itunes_codecs; // Use iTunes codecs
else if (strncmp(user_agent, "Roku", strlen("Roku")) == 0)
client_codecs = roku_codecs;
else if (strncmp(user_agent, "Hifidelio", strlen("Hifidelio")) == 0)
/* Allegedly can't transcode for Hifidelio because their
* HTTP implementation doesn't honour Connection: close.
* At least, that's why mt-daapd didn't do it.
*/
return XCODE_NONE;
}
if (!client_codecs)
client_codecs = default_codecs;
else
DPRINTF(E_SPAM, L_XCODE, "Client advertises codecs: %s\n", client_codecs);
if (!force_xcode && strstr(client_codecs, file_codectype))
return XCODE_NONE;
supports_alac = strstr(client_codecs, "alac") || strstr(client_codecs, "mp4a");
supports_mpeg = strstr(client_codecs, "mpeg") && avcodec_find_encoder(AV_CODEC_ID_MP3);
supports_wav = strstr(client_codecs, "wav");
prefer_format = cfg_getstr(lib, "prefer_format");
if (prefer_format)
{
if (strcmp(prefer_format, "wav") == 0 && supports_wav)
return XCODE_WAV;
if (strcmp(prefer_format, "mpeg") == 0 && supports_mpeg)
return XCODE_MP3;
if (strcmp(prefer_format, "alac") == 0 && supports_alac)
return XCODE_MP4_ALAC;
}
// This order determines the default if the user didn't configure a preference.
// The lossless formats are given the highest priority.
if (supports_wav)
return XCODE_WAV;
if (supports_mpeg)
return XCODE_MP3;
if (supports_alac)
return XCODE_MP4_ALAC;
return XCODE_UNKNOWN;
}

/* Cleanup */
void
transcode_decode_cleanup(struct decode_ctx **ctx)
{
if (!(*ctx))
return;
close_input(*ctx);
av_packet_free(&(*ctx)->packet);
av_frame_free(&(*ctx)->decoded_frame);
free(*ctx);
*ctx = NULL;
}

void
transcode_encode_cleanup(struct encode_ctx **ctx)
{
if (!*ctx)
return;
close_filters(*ctx);
close_output(*ctx);
evbuffer_free((*ctx)->evbuf_io.evbuf);
av_packet_free(&(*ctx)->encoded_pkt);
av_frame_free(&(*ctx)->filt_frame);
free(*ctx);
*ctx = NULL;
}

void
transcode_cleanup(struct transcode_ctx **ctx)
{
if (!*ctx)
return;
transcode_encode_cleanup(&(*ctx)->encode_ctx);
transcode_decode_cleanup(&(*ctx)->decode_ctx);
free(*ctx);
*ctx = NULL;
}

/* Encoding, decoding and transcoding */
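
// Decodes from the input until a frame is ready. Returns 1 if a frame was
// decoded, 0 if the end of the input was reached, -1 on error.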
int
transcode_decode(transcode_frame **frame, struct decode_ctx *dec_ctx)
{
struct transcode_ctx ctx;
int ret;
if (dec_ctx->got_frame)
DPRINTF(E_LOG, L_XCODE, "Bug! Currently no support for multiple calls to transcode_decode()\n");
ctx.decode_ctx = dec_ctx;
ctx.encode_ctx = NULL;
do
{
// This function stops after decoding because ctx->encode_ctx is NULL
ret = read_decode_filter_encode_write(&ctx);
}
while ((ret == 0) && (!dec_ctx->got_frame));
if (ret < 0)
return -1;
*frame = dec_ctx->decoded_frame;
if (dec_ctx->eof)
return 0;
return 1;
}

// Filters and encodes
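// Returns the number of bytes added to evbuf (may be zero), or a negative
// value on error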
int
transcode_encode(struct evbuffer *evbuf, struct encode_ctx *ctx, transcode_frame *frame, int eof)
{
AVFrame *f = frame;
struct stream_ctx *s;
size_t start_length;
int ret;
start_length = evbuffer_get_length(ctx->obuf);
// Really crappy way of detecting if frame is audio, video or something else
#if USE_CH_LAYOUT
if (f->ch_layout.nb_channels && f->sample_rate)
#else
if (f->channel_layout && f->sample_rate)
#endif
s = &ctx->audio_stream;
else if (f->width && f->height)
s = &ctx->video_stream;
else
{
DPRINTF(E_LOG, L_XCODE, "Bug! Encoder could not detect frame type\n");
return -1;
}
ret = filter_encode_write(ctx, s, f);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error occurred while encoding: %s\n", err2str(ret));
return ret;
}
// Flush
if (eof)
{
filter_encode_write(ctx, s, NULL);
av_write_trailer(ctx->ofmt_ctx);
}
ret = evbuffer_get_length(ctx->obuf) - start_length;
evbuffer_add_buffer(evbuf, ctx->obuf);
return ret;
}
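
// Decodes and re-encodes from the input until at least want_bytes of output
// has been produced (everything, if want_bytes is 0), then moves the output to
// evbuf. Returns the number of bytes produced, 0 if the end of the input was
// already reached, or a negative value on error. If icy_timer is given, it is
// set when an ICY metadata check is due.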
int
transcode(struct evbuffer *evbuf, int *icy_timer, struct transcode_ctx *ctx, int want_bytes)
{
size_t start_length;
int processed = 0;
int ret;
if (icy_timer)
*icy_timer = 0;
if (ctx->decode_ctx->eof)
return 0;
start_length = evbuffer_get_length(ctx->encode_ctx->obuf);
do
{
ret = read_decode_filter_encode_write(ctx);
processed = evbuffer_get_length(ctx->encode_ctx->obuf) - start_length;
}
while ((ret == 0) && (!want_bytes || (processed < want_bytes)));
evbuffer_add_buffer(evbuf, ctx->encode_ctx->obuf);
ctx->encode_ctx->bytes_processed += processed;
if (icy_timer && ctx->encode_ctx->icy_interval)
*icy_timer = (ctx->encode_ctx->bytes_processed % ctx->encode_ctx->icy_interval < processed);
if ((ret < 0) && (ret != AVERROR_EOF))
return ret;
return processed;
}
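
// Wraps a raw PCM buffer of the given quality in a frame that can be passed to
// transcode_encode(). The frame references the caller's buffer instead of
// copying it, so the buffer must stay valid while the frame is in use.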
transcode_frame *
transcode_frame_new(void *data, size_t size, int nsamples, struct media_quality *quality)
{
AVFrame *f;
int ret;
f = av_frame_alloc();
if (!f)
{
DPRINTF(E_LOG, L_XCODE, "Out of memory for frame\n");
return NULL;
}
f->format = bitdepth2format(quality->bits_per_sample);
if (f->format == AV_SAMPLE_FMT_NONE)
{
DPRINTF(E_LOG, L_XCODE, "transcode_frame_new() called with unsupported bps (%d)\n", quality->bits_per_sample);
av_frame_free(&f);
return NULL;
}
f->sample_rate = quality->sample_rate;
f->nb_samples = nsamples;
#if USE_CH_LAYOUT
av_channel_layout_default(&f->ch_layout, quality->channels);
#else
f->channel_layout = av_get_default_channel_layout(quality->channels);
# ifdef HAVE_FFMPEG
f->channels = quality->channels;
# endif
#endif
f->pts = AV_NOPTS_VALUE;
// We don't align because the frame won't be given directly to the encoder
// anyway; it will first go through the filter (which might align it...?)
ret = avcodec_fill_audio_frame(f, quality->channels, f->format, data, size, 1);
if (ret < 0)
{
DPRINTF(E_LOG, L_XCODE, "Error filling frame with rawbuf, size %zu, samples %d (%d/%d/%d): %s\n",
size, nsamples, quality->sample_rate, quality->bits_per_sample, quality->channels, err2str(ret));
av_frame_free(&f);
return NULL;
}
return f;
}

void
transcode_frame_free(transcode_frame *frame)
{
AVFrame *f = frame;
av_frame_free(&f);
}

/* Seeking */
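
// Seeks to the given position. Returns the position actually seeked to in ms,
// which may differ from the requested position, or -1 on error.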
int
transcode_seek(struct transcode_ctx *ctx, int ms)
{
struct decode_ctx *dec_ctx = ctx->decode_ctx;
struct stream_ctx *s;
int64_t start_time;
int64_t target_pts;
int64_t got_pts;
int got_ms;
int ret;
s = &dec_ctx->audio_stream;
if (!s->stream)
{
DPRINTF(E_LOG, L_XCODE, "Could not seek in non-audio input\n");
return -1;
}
start_time = s->stream->start_time;
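  // Convert the requested position from milliseconds to the stream's time base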
target_pts = ms;
target_pts = target_pts * AV_TIME_BASE / 1000;
target_pts = av_rescale_q(target_pts, AV_TIME_BASE_Q, s->stream->time_base);
if ((start_time != AV_NOPTS_VALUE) && (start_time > 0))
target_pts += start_time;
ret = av_seek_frame(dec_ctx->ifmt_ctx, s->stream->index, target_pts, AVSEEK_FLAG_BACKWARD);
if (ret < 0)
{
DPRINTF(E_WARN, L_XCODE, "Could not seek into stream: %s\n", err2str(ret));
return -1;
}
avcodec_flush_buffers(s->codec);
// Fast forward until first packet with a timestamp is found
s->codec->skip_frame = AVDISCARD_NONREF;
while (1)
{
dec_ctx->timestamp = av_gettime();
av_packet_unref(dec_ctx->packet);
ret = av_read_frame(dec_ctx->ifmt_ctx, dec_ctx->packet);
if (ret < 0)
{
DPRINTF(E_WARN, L_XCODE, "Could not read more data while seeking: %s\n", err2str(ret));
s->codec->skip_frame = AVDISCARD_DEFAULT;
return -1;
}
if (stream_find(dec_ctx, dec_ctx->packet->stream_index) == AVMEDIA_TYPE_UNKNOWN)
continue;
// Need a pts to return the real position
if (dec_ctx->packet->pts == AV_NOPTS_VALUE)
continue;
break;
}
s->codec->skip_frame = AVDISCARD_DEFAULT;
// Tell read_packet() to resume with dec_ctx->packet
dec_ctx->resume = 1;
// Compute position in ms from pts
got_pts = dec_ctx->packet->pts;
if ((start_time != AV_NOPTS_VALUE) && (start_time > 0))
got_pts -= start_time;
got_pts = av_rescale_q(got_pts, s->stream->time_base, AV_TIME_BASE_Q);
got_ms = got_pts / (AV_TIME_BASE / 1000);
// Since negative return would mean error, we disallow it here
if (got_ms < 0)
got_ms = 0;
DPRINTF(E_DBG, L_XCODE, "Seek wanted %d ms, got %d ms\n", ms, got_ms);
return got_ms;
}

/* Querying */
int
transcode_decode_query(struct decode_ctx *ctx, const char *query)
{
if (strcmp(query, "width") == 0)
{
if (ctx->video_stream.stream)
return ctx->video_stream.stream->codecpar->width;
}
else if (strcmp(query, "height") == 0)
{
if (ctx->video_stream.stream)
return ctx->video_stream.stream->codecpar->height;
}
else if (strcmp(query, "is_png") == 0)
{
if (ctx->video_stream.stream)
return (ctx->video_stream.stream->codecpar->codec_id == AV_CODEC_ID_PNG);
}
else if (strcmp(query, "is_jpeg") == 0)
{
if (ctx->video_stream.stream)
return (ctx->video_stream.stream->codecpar->codec_id == AV_CODEC_ID_MJPEG);
}
return -1;
}

int
transcode_encode_query(struct encode_ctx *ctx, const char *query)
{
if (strcmp(query, "sample_rate") == 0)
{
if (ctx->audio_stream.stream)
return ctx->audio_stream.stream->codecpar->sample_rate;
}
else if (strcmp(query, "bits_per_sample") == 0)
{
if (ctx->audio_stream.stream)
return av_get_bits_per_sample(ctx->audio_stream.stream->codecpar->codec_id);
}
else if (strcmp(query, "channels") == 0)
{
if (ctx->audio_stream.stream)
#if USE_CH_LAYOUT
return ctx->audio_stream.stream->codecpar->ch_layout.nb_channels;
#else
return ctx->audio_stream.stream->codecpar->channels;
#endif
}
else if (strcmp(query, "samples_per_frame") == 0)
{
if (ctx->audio_stream.stream)
return ctx->audio_stream.stream->codecpar->frame_size;
}
else if (strcmp(query, "estimated_size") == 0)
{
if (ctx->audio_stream.stream)
return ctx->bytes_total;
}
return -1;
}

/* Metadata */
struct http_icy_metadata *
transcode_metadata(struct transcode_ctx *ctx, int *changed)
{
struct http_icy_metadata *m;
if (!ctx->decode_ctx->ifmt_ctx)
return NULL;
m = http_icy_metadata_get(ctx->decode_ctx->ifmt_ctx, 1);
if (!m)
return NULL;
*changed = (m->hash != ctx->encode_ctx->icy_hash);
ctx->encode_ctx->icy_hash = m->hash;
return m;
}
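
// Fills in human readable strings (type, codec, description, bitrate and
// estimated file size) matching the given profile and quality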
void
transcode_metadata_strings_set(struct transcode_metadata_string *s, enum transcode_profile profile, struct media_quality *q, uint32_t len_ms)
{
off_t bytes;
memset(s, 0, sizeof(struct transcode_metadata_string));
switch (profile)
{
case XCODE_WAV:
s->type = "wav";
s->codectype = "wav";
s->description = "WAV audio file";
snprintf(s->bitrate, sizeof(s->bitrate), "%d", 8 * STOB(q->sample_rate, q->bits_per_sample, q->channels) / 1000); // 44100/16/2 -> 1411
bytes = size_estimate(profile, q->bit_rate, q->sample_rate, q->bits_per_sample / 8, q->channels, len_ms);
snprintf(s->file_size, sizeof(s->file_size), "%d", (int)bytes);
break;
case XCODE_MP3:
s->type = "mp3";
s->codectype = "mpeg";
s->description = "MPEG audio file";
snprintf(s->bitrate, sizeof(s->bitrate), "%d", q->bit_rate / 1000);
bytes = size_estimate(profile, q->bit_rate, q->sample_rate, q->bits_per_sample / 8, q->channels, len_ms);
snprintf(s->file_size, sizeof(s->file_size), "%d", (int)bytes);
break;
case XCODE_MP4_ALAC:
s->type = "m4a";
s->codectype = "alac";
s->description = "Apple Lossless audio file";
snprintf(s->bitrate, sizeof(s->bitrate), "%d", 8 * STOB(q->sample_rate, q->bits_per_sample, q->channels) / 1000); // 44100/16/2 -> 1411
bytes = size_estimate(profile, q->bit_rate, q->sample_rate, q->bits_per_sample / 8, q->channels, len_ms);
snprintf(s->file_size, sizeof(s->file_size), "%d", (int)bytes);
break;
default:
DPRINTF(E_WARN, L_XCODE, "transcode_metadata_strings_set() called with unknown profile %d\n", profile);
}
}
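
// Creates a prepared header for profiles that require one. Currently only
// XCODE_MP4_ALAC is supported; other profiles return -1.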
int
transcode_prepare_header(struct evbuffer **header, enum transcode_profile profile, const char *path)
{
int ret;
switch (profile)
{
case XCODE_MP4_ALAC:
ret = make_mp4_header(header, path);
break;
default:
ret = -1;
}
return ret;
}