diff --git a/DOCS/man/en/options.rst b/DOCS/man/en/options.rst index 83ceb15c49..aae283d213 100644 --- a/DOCS/man/en/options.rst +++ b/DOCS/man/en/options.rst @@ -2031,9 +2031,9 @@ ``--subcp=enca::`` You can specify your language using a two letter language code to make - ENCA detect the codepage automatically. If unsure, enter anything and - watch mpv ``-v`` output for available languages. Fallback codepage - specifies the codepage to use, when autodetection fails. + ENCA detect the codepage automatically. If unsure, enter anything (if the + language is invalid, mpv will complain and list valid languages). + Fallback codepage specifies the codepage to use if autodetection fails. *EXAMPLE*: @@ -2041,6 +2041,8 @@ are Czech, fall back on latin 2, if the detection fails. - ``--subcp=enca:pl:cp1250`` guess the encoding for Polish, fall back on cp1250. + - ``--subcp=enca:pl`` guess the encoding for Polish, fall back on UTF-8. + - ``--subcp=enca`` try universal detection, fall back on UTF-8. --sub-delay= Delays subtitles by seconds. Can be negative. diff --git a/Makefile b/Makefile index 1fa78d676b..1c4c65a098 100644 --- a/Makefile +++ b/Makefile @@ -170,6 +170,7 @@ SOURCES = talloc.c \ core/av_log.c \ core/av_opts.c \ core/bstr.c \ + core/charset_conv.c \ core/codecs.c \ core/command.c \ core/cpudetect.c \ diff --git a/core/charset_conv.c b/core/charset_conv.c new file mode 100644 index 0000000000..15209b30ea --- /dev/null +++ b/core/charset_conv.c @@ -0,0 +1,240 @@ +/* + * This file is part of mpv. + * + * Based on code taken from libass (ISC license), which was originally part + * of MPlayer (GPL). + * Copyright (C) 2006 Evgeniy Stepanov + * + * mpv is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with mpv. If not, see . + */ + +#include +#include +#include + +#include "config.h" + +#include "core/mp_msg.h" + +#ifdef CONFIG_ENCA +#include +#endif + +#ifdef CONFIG_ICONV +#include +#endif + +#include "charset_conv.h" + +// Split the string on ':' into components. +// out_arr is at least max entries long. +// Return number of out_arr entries filled. +static int split_colon(const char *user_cp, int max, bstr *out_arr) +{ + if (!user_cp || max < 1) + return 0; + + int count = 0; + while (1) { + const char *next = strchr(user_cp, ':'); + if (next && max - count > 1) { + out_arr[count++] = (bstr){(char *)user_cp, next - user_cp}; + user_cp = next + 1; + } else { + out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)}; + break; + } + } + return count; +} + +// Returns true if user_cp implies that calling mp_charset_guess() on the +// input data is required to determine the real codepage. This is the case +// if user_cp is not a real iconv codepage, but a magic value that requests +// for example ENCA charset auto-detection. +bool mp_charset_requires_guess(const char *user_cp) +{ + bstr res[2] = {{0}}; + split_colon(user_cp, 2, res); + return bstrcasecmp0(res[0], "enca") == 0; +} + +#ifdef CONFIG_ENCA +static const char *enca_guess(bstr buf, const char *language) +{ + if (!language || !language[0]) + language = "__"; // neutral language + + const char *detected_cp = NULL; + + EncaAnalyser analyser = enca_analyser_alloc(language); + if (analyser) { + enca_set_termination_strictness(analyser, 0); + EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len); + const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV); + if (tmp && enc.charset != ENCA_CS_UNKNOWN) + detected_cp = tmp; + enca_analyser_free(analyser); + } else { + mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n", + language); + size_t langcnt; + const char **languages = enca_get_languages(&langcnt); + mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:"); + for (int i = 0; i < langcnt; i++) + mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]); + mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n"); + free(languages); + } + + return detected_cp; +} +#endif + +// Runs charset auto-detection on the input buffer, and returns the result. +// If auto-detection fails, NULL is returned. +// If user_cp doesn't refer to any known auto-detection (for example because +// it's a real iconv codepage), user_cp is returned without even looking at +// the buf data. +const char *mp_charset_guess(bstr buf, const char *user_cp) +{ + if (!mp_charset_requires_guess(user_cp)) + return user_cp; + + bstr params[3] = {{0}}; + split_colon(user_cp, 3, params); + + bstr type = params[0]; + char lang[100]; + snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); + const char *fallback = params[2].start; // last item, already 0-terminated + + const char *res = NULL; + +#ifdef CONFIG_ENCA + if (bstrcasecmp0(type, "enca") == 0) + res = enca_guess(buf, lang); +#endif + + if (res) { + mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", + BSTR_P(type), res); + } else { + res = fallback; + mp_msg(MSGT_SUBREADER, MSGL_DBG2, + "Detection with %.*s failed: fallback to %s\n", + BSTR_P(type), res && res[0] ? res : "no conversion"); + } + + return res; +} + +// Convert the data in buf to UTF-8. The charset argument can be an iconv +// codepage, a value returned by mp_charset_conv_guess(), or a special value +// that triggers autodetection of the charset (e.g. using ENCA). +// The auto-detection is the only difference to mp_iconv_to_utf8(). +// buf: same as mp_iconv_to_utf8() +// user_cp: iconv codepage, special value, NULL +// flags: same as mp_iconv_to_utf8() +// returns: same as mp_iconv_to_utf8() +bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags) +{ + return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp), flags); +} + +// Use iconv to convert buf to UTF-8. +// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is +// obviously no conversion required (e.g. if cp is "UTF-8"). +// Returns a newly allocated buffer if conversion is done and succeeds. The +// buffer will be terminated with 0 for convenience (the terminating 0 is not +// included in the returned length). +// Free the returned buffer with talloc_free(). +// buf: input data +// cp: iconv codepage (or NULL) +// flags: combination of MP_ICONV_* flags +// returns: buf (no conversion), .start==NULL (error), or allocated buffer +bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags) +{ +#ifdef CONFIG_ICONV + const char *tocp = "UTF-8"; + + if (!cp || !cp[0] || strcasecmp(cp, tocp) == 0) + return buf; + + if (strcasecmp(cp, "ASCII") == 0) + return buf; + + iconv_t icdsc; + if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) { + if (flags & MP_ICONV_VERBOSE) + mp_msg(MSGT_SUBREADER, MSGL_ERR, + "Error opening iconv with codepage '%s'\n", cp); + goto failure; + } + + size_t size = buf.len; + size_t osize = size; + size_t ileft = size; + size_t oleft = size - 1; + + char *outbuf = talloc_size(NULL, osize); + char *ip = buf.start; + char *op = outbuf; + + while (1) { + int clear = 0; + size_t rc; + if (ileft) + rc = iconv(icdsc, &ip, &ileft, &op, &oleft); + else { + clear = 1; // clear the conversion state and leave + rc = iconv(icdsc, NULL, NULL, &op, &oleft); + } + if (rc == (size_t) (-1)) { + if (errno == E2BIG) { + size_t offset = op - outbuf; + outbuf = talloc_realloc_size(NULL, outbuf, osize + size); + op = outbuf + offset; + osize += size; + oleft += size; + } else { + if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) { + // This is intended for cases where the input buffer is cut + // at a random byte position. If this happens in the middle + // of the buffer, it should still be an error. We say it's + // fine if the error is within 10 bytes of the end. + if (ileft <= 10) + break; + } + if (flags & MP_ICONV_VERBOSE) { + mp_msg(MSGT_SUBREADER, MSGL_ERR, + "Error recoding text with codepage '%s'\n", cp); + } + talloc_free(outbuf); + iconv_close(icdsc); + goto failure; + } + } else if (clear) + break; + } + + iconv_close(icdsc); + + outbuf[osize - oleft - 1] = 0; + return (bstr){outbuf, osize - oleft - 1}; +#endif + +failure: + return (bstr){0}; +} diff --git a/core/charset_conv.h b/core/charset_conv.h new file mode 100644 index 0000000000..00a2658da3 --- /dev/null +++ b/core/charset_conv.h @@ -0,0 +1,17 @@ +#ifndef MP_CHARSET_CONV_H +#define MP_CHARSET_CONV_H + +#include +#include "core/bstr.h" + +enum { + MP_ICONV_VERBOSE = 1, // print errors instead of failing silently + MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data +}; + +bool mp_charset_requires_guess(const char *user_cp); +const char *mp_charset_guess(bstr buf, const char *user_cp); +bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags); +bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags); + +#endif diff --git a/sub/dec_sub.c b/sub/dec_sub.c index 54f3c1ebfe..2b4bfc2e8d 100644 --- a/sub/dec_sub.c +++ b/sub/dec_sub.c @@ -18,6 +18,7 @@ #include #include +#include #include #include "config.h" @@ -27,6 +28,7 @@ #include "dec_sub.h" #include "core/options.h" #include "core/mp_msg.h" +#include "core/charset_conv.h" extern const struct sd_functions sd_ass; extern const struct sd_functions sd_lavc; @@ -56,6 +58,7 @@ struct dec_sub { struct sd init_sd; double video_fps; + const char *charset; struct sd *sd[MAX_NUM_SD]; int num_sd; @@ -196,6 +199,37 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_sub *sh) sh->gsh->codec ? sh->gsh->codec : ""); } +static const char *guess_sub_cp(struct packet_list *subs, const char *usercp) +{ + if (!mp_charset_requires_guess(usercp)) + return usercp; + + // Concat all subs into a buffer. We can't probably do much better without + // having the original data (which we don't, not anymore). + int max_size = 2 * 1024 * 1024; + const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU + int sep_len = strlen(sep); + int num_pkt = 0; + int size = 0; + for (int n = 0; n < subs->num_packets; n++) { + struct demux_packet *pkt = subs->packets[n]; + if (size + pkt->len > max_size) + break; + size += pkt->len + sep_len; + num_pkt++; + } + bstr text = {talloc_size(NULL, size), 0}; + for (int n = 0; n < num_pkt; n++) { + struct demux_packet *pkt = subs->packets[n]; + memcpy(text.start + text.len, pkt->buffer, pkt->len); + memcpy(text.start + text.len + pkt->len, sep, sep_len); + text.len += pkt->len + sep_len; + } + const char *guess = mp_charset_guess(text, usercp); + talloc_free(text.start); + return guess; +} + static void multiply_timings(struct packet_list *subs, double factor) { for (int n = 0; n < subs->num_packets; n++) { @@ -262,6 +296,7 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh) if (!sub_accept_packets_in_advance(sub) || sh->track) return false; + const char *codec = sh->gsh->codec ? sh->gsh->codec : ""; void *tmp = talloc_new(NULL); struct packet_list subs = {0}; @@ -275,6 +310,14 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh) MP_TARRAY_APPEND(tmp, subs.packets, subs.num_packets, pkt); } + // Can't run auto-detection on movtext packets: it's the only codec that + // even though it decodes to text has binary input data. + if (opts->sub_cp && strcmp(codec, "movtext") != 0) + sub->charset = guess_sub_cp(&subs, opts->sub_cp); + + if (sub->charset) + mp_msg(MSGT_OSD, MSGL_INFO, "Using subtitle charset: %s\n", sub->charset); + // 23.976 FPS is used as default timebase for frame based formats if (sub->video_fps && sh->frame_based) multiply_timings(&subs, sub->video_fps / 23.976); @@ -313,10 +356,34 @@ static void decode_next(struct dec_sub *sub, int n, struct demux_packet *packet) } } +static struct demux_packet *recode_packet(struct demux_packet *in, + const char *charset) +{ + struct demux_packet *pkt = NULL; + bstr in_buf = {in->buffer, in->len}; + bstr conv = mp_iconv_to_utf8(in_buf, charset, MP_ICONV_VERBOSE); + if (conv.start && conv.start != in_buf.start) { + pkt = talloc_ptrtype(NULL, pkt); + talloc_steal(pkt, conv.start); + *pkt = (struct demux_packet) { + .buffer = conv.start, + .len = conv.len, + .pts = in->pts, + .duration = in->duration, + .avpacket = in->avpacket, // questionable, but gives us sidedata + }; + } + return pkt; +} + void sub_decode(struct dec_sub *sub, struct demux_packet *packet) { - if (sub->num_sd > 0) - decode_next(sub, 0, packet); + if (sub->num_sd > 0) { + struct demux_packet *recoded = NULL; + if (sub->charset) + recoded = recode_packet(packet, sub->charset); + decode_next(sub, 0, recoded ? recoded : packet); + } } void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,