openvpn3/openvpn/common/unicode.hpp

//    OpenVPN -- An application to securely tunnel IP networks
//               over a single port, with support for SSL/TLS-based
//               session authentication and key exchange,
//               packet encryption, packet authentication, and
//               packet compression.
//
//    Copyright (C) 2012-2015 OpenVPN Technologies, Inc.
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU Affero General Public License Version 3
//    as published by the Free Software Foundation.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Affero General Public License for more details.
//
//    You should have received a copy of the GNU Affero General Public License
//    along with this program in the COPYING file.
//    If not, see <http://www.gnu.org/licenses/>.

// General-purpose function for dealing with unicode.

#ifndef OPENVPN_COMMON_UNICODE_H
#define OPENVPN_COMMON_UNICODE_H

#include <string>
#include <cstring>           // for std::memcpy
#include <algorithm>         // for std::min
#include <memory>
#include <cctype>

#include <openvpn/common/size.hpp>
#include <openvpn/common/exception.hpp>
#include <openvpn/common/unicode-impl.hpp>
#include <openvpn/buffer/buffer.hpp>

namespace openvpn {
  namespace Unicode {

    OPENVPN_SIMPLE_EXCEPTION(unicode_src_overflow);
    OPENVPN_SIMPLE_EXCEPTION(unicode_dest_overflow);
    OPENVPN_SIMPLE_EXCEPTION(unicode_malformed);

    // Return true if the given buffer is a valid UTF-8 string.
    // Extra constraints:
    enum {
      UTF8_NO_CTRL  = (1<<30), // no control chars allowed
      UTF8_NO_SPACE = (1<<31), // no space chars allowed
    };
    inline bool is_valid_utf8_uchar_buf(const unsigned char *source,
					size_t size,
					const size_t max_len_flags=0) // OR max length (or 0 to disable) with UTF8_x flags above
    {
      const size_t max_len = max_len_flags & ((size_t)UTF8_NO_CTRL-1); // NOTE -- use smallest flag value here
      size_t unicode_len = 0;
      while (size)
	{
	  const unsigned char c = *source;
	  if (c == '\0')
	    return false;
	  const int length = trailingBytesForUTF8[c]+1;
	  if ((size_t)length > size)
	    return false;
	  if (!isLegalUTF8(source, length))
	    return false;
	  if (length == 1)
	    {
	      if ((max_len_flags & UTF8_NO_CTRL) && std::iscntrl(c))
		return false;
	      if ((max_len_flags & UTF8_NO_SPACE) && std::isspace(c))
		return false;
	    }

	  source += length;
	  size -= length;
	  ++unicode_len;
	  if (max_len && unicode_len > max_len)
	    return false;
	}
      return true;
    }

    template <typename STRING>
    inline bool is_valid_utf8(const STRING& str, const size_t max_len_flags=0)
    {
      return is_valid_utf8_uchar_buf((const unsigned char *)str.c_str(), str.length(), max_len_flags);
    }

    // Return the byte position in the string that corresponds with
    // the given character index.  Return values:
    enum {
      UTF8_GOOD=0, // succeeded, result in index
      UTF8_BAD,    // failed, string is not legal UTF8
      UTF8_RANGE,  // failed, index is beyond end of string
    };
    template <typename STRING>
    inline int utf8_index(STRING& str, size_t& index)
    {
      const size_t size = str.length();
      size_t upos = 0;
      size_t pos = 0;
      while (pos < size)
	{
	  const int len = trailingBytesForUTF8[(unsigned char)str[pos]]+1;
	  if (pos + len > size || !isLegalUTF8((const unsigned char *)&str[pos], len))
	    return UTF8_BAD;
	  if (upos >= index)
	    {
	      index = pos;
	      return UTF8_GOOD;
	    }
	  pos += len;
	  ++upos;
	}
      return UTF8_RANGE;
    }

    // Truncate a UTF8 string if its length exceeds max_len
    template <typename STRING>
    inline void utf8_truncate(STRING& str, size_t max_len)
    {
      const int status = utf8_index(str, max_len);
      if (status == UTF8_GOOD || status == UTF8_BAD)
	str = str.substr(0, max_len);
    }

    // Return a printable UTF-8 string, where bad UTF-8 chars and
    // control chars are mapped to '?'.
    // If max_len_flags > 0, print a maximum of max_len_flags chars.
    // If UTF8_PASS_FMT flag is set in max_len_flags, pass through \r\n\t
    enum {
      UTF8_PASS_FMT=(1<<31),
      UTF8_FILTER=(1<<30),
    };
    template <typename STRING>
    inline STRING utf8_printable(const STRING& str, size_t max_len_flags)
    {
      STRING ret;
      const size_t size = str.length();
      const size_t max_len = max_len_flags & ((size_t)UTF8_FILTER-1); // NOTE -- use smallest flag value here
      size_t upos = 0;
      size_t pos = 0;
      ret.reserve(std::min(str.length(), max_len) + 3); // add 3 for "..."
      while (pos < size)
	{
	  if (!max_len || upos < max_len)
	    {
	      unsigned char c = str[pos];
	      int len = trailingBytesForUTF8[c]+1;
	      if (pos + len <= size
		  && c >= 0x20 && c != 0x7F
		  && isLegalUTF8((const unsigned char *)&str[pos], len))
		{
		  // non-control, legal UTF-8
		  ret.append(str, pos, len);
		}
	      else
		{
		  // control char or bad UTF-8 char
		  if (c == '\r' || c == '\n' || c == '\t')
		    {
		      if (!(max_len_flags & UTF8_PASS_FMT))
			c = ' ';
		    }
		  else if (max_len_flags & UTF8_FILTER)
		    c = 0;
		  else
		    c = '?';
		  if (c)
		    ret += c;
		  len = 1;
		}
	      pos += len;
	      ++upos;
	    }
	  else
	    {
	      ret.append("...");
	      break;
	    }
	}
      return ret;
    }

    template <typename STRING>
    inline size_t utf8_length(const STRING& str)
    {
      const size_t size = str.length();
      size_t upos = 0;
      size_t pos = 0;
      while (pos < size)
	{
	  int len = std::min((int)trailingBytesForUTF8[(unsigned char)str[pos]]+1,
			     (int)size);
	  if (!isLegalUTF8((const unsigned char *)&str[pos], len))
	    len = 1;
	  pos += len;
	  ++upos;
	}
      return upos;
    }

    inline void conversion_result_throw(const ConversionResult res)
    {
      switch (res)
	{
	case conversionOK:
	  return;
	case sourceExhausted:
	  throw unicode_src_overflow();
	case targetExhausted:
	  throw unicode_dest_overflow();
	case sourceIllegal:
	  throw unicode_malformed();
	}
    }

    // Convert a UTF-8 string to UTF-16 little endian (no null termination in return)
    template <typename STRING>
    inline BufferPtr string_to_utf16(const STRING& str)
    {
      std::unique_ptr<UTF16[]> utf16_dest(new UTF16[str.length()]);
      const UTF8 *src = (UTF8 *)str.c_str();
      UTF16 *dest = utf16_dest.get();
      const ConversionResult res = ConvertUTF8toUTF16(&src, src + str.length(),
						      &dest, dest + str.length(),
						      lenientConversion);
      conversion_result_throw(res);
      BufferPtr ret(new BufferAllocated((dest - utf16_dest.get()) * 2, BufferAllocated::ARRAY));
      UTF8 *d = ret->data();
      for (const UTF16 *s = utf16_dest.get(); s < dest; ++s)
	{
	  *d++ = *s & 0xFF;
	  *d++ = (*s >> 8) & 0xFF;
	}
      return ret;
    }

    class UTF8Iterator
    {
    public:
      struct Char
      {
	unsigned int len;
	unsigned char data[4];
	bool valid;

	const bool is_valid() const
	{
	  return valid && len >= 1 && len <= sizeof(data);
	}

	std::string str(const char *malformed)
	{
	  if (is_valid())
	    return std::string((char *)data, len);
	  else
	    return malformed;
	}
      };

      UTF8Iterator(const std::string& str_arg)
	: str((unsigned char *)str_arg.c_str()),
	  size(str_arg.length())
      {
      }

      bool get(Char &c)
      {
	if (size)
	  {
	    unsigned int len = std::min((unsigned int)trailingBytesForUTF8[*str]+1,
					(unsigned int)size);
	    if (isLegalUTF8(str, len))
	      {
		c.valid = true;
		c.len = std::min(len, (unsigned int)sizeof(c.data));
		std::memcpy(c.data, str, c.len);
	      }
	    else
	      {
		c.valid = false;
		c.len = 1;
	      }
	    str += c.len;
	    size -= c.len;
	    return true;
	  }
	else
	  return false;
      }

    private:
      const unsigned char *str;
      unsigned int size;
    };
  }
}

#endif