| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <gtest/gtest.h> |
| |
| #include <iconv.h> |
| |
| #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1) |
| |
| TEST(iconv, iconv_open_EINVAL) { |
| errno = 0; |
| ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "silly")); |
| ASSERT_EQ(EINVAL, errno); |
| errno = 0; |
| ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "UTF-8")); |
| ASSERT_EQ(EINVAL, errno); |
| errno = 0; |
| ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "silly")); |
| ASSERT_EQ(EINVAL, errno); |
| } |
| |
| TEST(iconv, iconv_open_comparator) { |
| // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching: |
| // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..." |
| iconv_t c; |
| ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "utf8")); |
| ASSERT_EQ(0, iconv_close(c)); |
| ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "u.t.f-008")); |
| ASSERT_EQ(0, iconv_close(c)); |
| |
| // "...but not "utf-80" or "ut8"." |
| errno = 0; |
| ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "utf-80")); |
| ASSERT_EQ(EINVAL, errno); |
| errno = 0; |
| ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut80")); |
| ASSERT_EQ(EINVAL, errno); |
| } |
| |
| TEST(iconv, iconv_smoke) { |
| const char* utf8 = "a٦ᄀ"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("UTF-32LE", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| |
| EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| |
| wchar_t* utf16 = reinterpret_cast<wchar_t*>(buf); |
| EXPECT_EQ(L'a', utf16[0]); |
| EXPECT_EQ(L'٦', utf16[1]); |
| EXPECT_EQ(L'ᄀ', utf16[2]); |
| EXPECT_EQ(L'\0', utf16[3]); |
| EXPECT_EQ(0U, in_bytes); |
| EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), out_bytes); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_lossy_TRANSLIT) { |
| const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("ASCII//TRANSLIT", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| |
| // Two of the input characters (5 input bytes) aren't representable as ASCII. |
| // With "//TRANSLIT", we use a replacement character, and report the number |
| // of replacements. |
| EXPECT_EQ(2U, iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| |
| EXPECT_EQ('a', buf[0]); |
| EXPECT_EQ('?', buf[1]); |
| EXPECT_EQ('?', buf[2]); |
| EXPECT_EQ('z', buf[3]); |
| EXPECT_EQ(0, buf[4]); |
| EXPECT_EQ(0U, in_bytes); |
| EXPECT_EQ(sizeof(buf) - 4, out_bytes); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_lossy_IGNORE) { |
| const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("ASCII//IGNORE", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| |
| // Two of the input characters (5 input bytes) aren't representable as ASCII. |
| // With "//IGNORE", we just skip them (but return failure). |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(EILSEQ, errno); |
| |
| EXPECT_EQ('a', buf[0]); |
| EXPECT_EQ('z', buf[1]); |
| EXPECT_EQ(0, buf[2]); |
| EXPECT_EQ(0U, in_bytes); |
| EXPECT_EQ(sizeof(buf) - 2, out_bytes); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_lossy) { |
| const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80 |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("ASCII", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| |
| // The second input character isn't representable as ASCII, so we stop there. |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(EILSEQ, errno); |
| |
| EXPECT_EQ('a', buf[0]); |
| EXPECT_EQ(0, buf[1]); |
| EXPECT_EQ(6U, in_bytes); // Two bytes for ٦, three bytes for ᄀ, and one byte for z. |
| EXPECT_EQ(sizeof(buf) - 1, out_bytes); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_malformed_sequence_EILSEQ) { |
| const char* utf8 = "a\xd9z"; // 0xd9 is the first byte of the two-byte U+0666 ٦. |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("UTF-8", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| |
| // The second input byte is a malformed character, so we stop there. |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(EILSEQ, errno); |
| EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the invalid sequence. |
| ++in; |
| --in_bytes; |
| errno = 0; |
| EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(0, errno); |
| |
| EXPECT_EQ('a', buf[0]); |
| EXPECT_EQ('z', buf[1]); |
| EXPECT_EQ(0, buf[2]); |
| EXPECT_EQ(0U, in_bytes); |
| EXPECT_EQ(sizeof(buf) - 2, out_bytes); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_incomplete_sequence_EINVAL) { |
| const char* utf8 = "a\xd9"; // 0xd9 is the first byte of the two-byte U+0666 ٦. |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("UTF-8", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| |
| // The second input byte is just the start of a character, and we don't have any more bytes. |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(EINVAL, errno); |
| EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the incomplete sequence. |
| |
| EXPECT_EQ('a', buf[0]); |
| EXPECT_EQ(0, buf[1]); |
| EXPECT_EQ(1U, in_bytes); |
| EXPECT_EQ(sizeof(buf) - 1, out_bytes); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_E2BIG) { |
| const char* utf8 = "abc"; |
| char buf[BUFSIZ] = {}; |
| |
| iconv_t c = iconv_open("UTF-8", "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c); |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(in); |
| |
| char* out = buf; |
| size_t out_bytes = 1; |
| |
| // We need three bytes, so one isn't enough (but we will make progress). |
| out_bytes = 1; |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(E2BIG, errno); |
| EXPECT_EQ(2U, in_bytes); |
| EXPECT_EQ(0U, out_bytes); |
| |
| // Two bytes left, so zero isn't enough (and we can't even make progress). |
| out_bytes = 0; |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(E2BIG, errno); |
| EXPECT_EQ(2U, in_bytes); |
| EXPECT_EQ(0U, out_bytes); |
| |
| // Two bytes left, so one isn't enough (but we will make progress). |
| out_bytes = 1; |
| errno = 0; |
| EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(E2BIG, errno); |
| EXPECT_EQ(1U, in_bytes); |
| EXPECT_EQ(0U, out_bytes); |
| |
| // One byte left, so one byte is now enough. |
| out_bytes = 1; |
| errno = 0; |
| EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(0, errno); |
| EXPECT_EQ(0U, in_bytes); |
| EXPECT_EQ(0U, out_bytes); |
| |
| EXPECT_EQ('a', buf[0]); |
| EXPECT_EQ('b', buf[1]); |
| EXPECT_EQ('c', buf[2]); |
| EXPECT_EQ(0, buf[3]); |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_invalid_converter_EBADF) { |
| char* in = nullptr; |
| char* out = nullptr; |
| size_t in_bytes = 0; |
| size_t out_bytes = 0; |
| errno = 0; |
| ASSERT_EQ(static_cast<size_t>(-1), iconv(INVALID_ICONV_T, &in, &in_bytes, &out, &out_bytes)); |
| ASSERT_EQ(EBADF, errno); |
| } |
| |
| TEST(iconv, iconv_close_invalid_converter_EBADF) { |
| errno = 0; |
| ASSERT_EQ(-1, iconv_close(INVALID_ICONV_T)); |
| ASSERT_EQ(EBADF, errno); |
| } |
| |
| static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) { |
| // Examples from https://en.wikipedia.org/wiki/UTF-16. |
| const char* utf8 = "$€𐐷"; // U+0024, U+20AC, U+10437. |
| |
| iconv_t c = iconv_open(dst_enc, "UTF-8"); |
| ASSERT_NE(INVALID_ICONV_T, c) << dst_enc; |
| |
| char* in = const_cast<char*>(utf8); |
| size_t in_bytes = strlen(utf8); |
| char buf[BUFSIZ] = {}; |
| char* out = buf; |
| size_t out_bytes = sizeof(buf); |
| size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes); |
| |
| // Check we got the bytes we were expecting. |
| for (size_t i = 0; i < n; ++i) { |
| EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc; |
| } |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| |
| // We can't round-trip if there were replacements. |
| if (strstr(dst_enc, "ascii")) { |
| GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n"; |
| return; |
| } |
| ASSERT_EQ(0U, replacement_count); |
| |
| c = iconv_open("UTF-8", dst_enc); |
| ASSERT_NE(INVALID_ICONV_T, c) << dst_enc; |
| |
| in = buf; |
| in_bytes = n; |
| char buf2[BUFSIZ] = {}; |
| out = buf2; |
| out_bytes = sizeof(buf2); |
| iconv(c, &in, &in_bytes, &out, &out_bytes); |
| |
| ASSERT_STREQ(utf8, buf2) << dst_enc; |
| |
| ASSERT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_round_trip_ascii) { |
| RoundTrip("ascii//TRANSLIT", "$??", 3); |
| } |
| |
| TEST(iconv, iconv_round_trip_utf8) { |
| RoundTrip("utf8", "\x24\xe2\x82\xac\xf0\x90\x90\xb7", 8); |
| } |
| |
| TEST(iconv, iconv_round_trip_utf16be) { |
| RoundTrip("utf16be", "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37", 8); |
| } |
| |
| TEST(iconv, iconv_round_trip_utf16le) { |
| RoundTrip("utf16le", "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc", 8); |
| } |
| |
| TEST(iconv, iconv_round_trip_utf32be) { |
| RoundTrip("utf32be", "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37", 12); |
| } |
| |
| TEST(iconv, iconv_round_trip_utf32le) { |
| RoundTrip("utf32le", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12); |
| } |
| |
| TEST(iconv, iconv_round_trip_wchar_t) { |
| RoundTrip("wchar_t", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12); |
| } |
| |
| static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) { |
| iconv_t c = iconv_open("wchar_t", src_enc); |
| char* in = const_cast<char*>(src); |
| size_t in_bytes = n; |
| wchar_t out_buf[16]; |
| size_t out_bytes = sizeof(out_buf); |
| char* out = reinterpret_cast<char*>(out_buf); |
| errno = 0; |
| ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(expected_errno, errno); |
| EXPECT_EQ(0, iconv_close(c)); |
| } |
| |
| TEST(iconv, iconv_EILSEQ_ascii) { |
| Check(EILSEQ, "ASCII", "\xac", 1); // > 0x7f, so not ASCII. |
| } |
| |
| TEST(iconv, iconv_EILSEQ_utf8_initial) { |
| Check(EILSEQ, "utf8", "\x82", 1); // Invalid initial byte. |
| } |
| |
| TEST(iconv, iconv_EILSEQ_utf8_non_initial) { |
| Check(EILSEQ, "utf8", "\xe2\xe2\x82", 3); // Invalid second byte. |
| } |
| |
| TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) { |
| Check(EILSEQ, "utf16be", "\xdc\x37" "\xd8\x01", 4); |
| } |
| |
| TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) { |
| Check(EILSEQ, "utf16le", "\x37\xdc" "\x01\xd8", 4); |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf8_short) { |
| Check(EINVAL, "utf8", "\xe2\x82", 2); // Missing final byte of 3-byte sequence. |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf16be_short) { |
| Check(EINVAL, "utf16be", "\x00", 1); // Missing second byte. |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) { |
| Check(EINVAL, "utf16be", "\xd8\x01", 2); |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) { |
| Check(EINVAL, "utf16be", "\xd8\x01\xdc", 3); |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf16le_short) { |
| Check(EINVAL, "utf16le", "\x24", 1); // Missing second byte. |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) { |
| Check(EINVAL, "utf16le", "\x01\xd8", 2); |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) { |
| Check(EINVAL, "utf16le", "\x01\xd8\x37", 3); |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf32be_short) { |
| Check(EINVAL, "utf32be", "\x00\x00\x00", 3); // Missing final byte. |
| } |
| |
| TEST(iconv, iconv_EINVAL_utf32le_short) { |
| Check(EINVAL, "utf32le", "\x24\x00\x00", 3); // Missing final byte. |
| } |
| |
| TEST(iconv, iconv_initial_shift_state) { |
| // POSIX: "For state-dependent encodings, the conversion descriptor |
| // cd is placed into its initial shift state by a call for which inbuf |
| // is a null pointer, or for which inbuf points to a null pointer." |
| iconv_t c = iconv_open("utf8", "utf8"); |
| char* in = nullptr; |
| size_t in_bytes = 0; |
| wchar_t out_buf[16]; |
| size_t out_bytes = sizeof(out_buf); |
| char* out = reinterpret_cast<char*>(out_buf); |
| |
| // Points to a null pointer... |
| errno = 0; |
| ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(0, errno); |
| EXPECT_EQ(sizeof(out_buf), out_bytes); |
| |
| // Is a null pointer... |
| errno = 0; |
| ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes)); |
| EXPECT_EQ(0, errno); |
| EXPECT_EQ(sizeof(out_buf), out_bytes); |
| |
| // Is a null pointer and so is in_bytes. This isn't specified by POSIX, but |
| // glibc and macOS both allow that, where Android historically didn't. |
| // https://issuetracker.google.com/180598400 |
| errno = 0; |
| ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, nullptr, &out, &out_bytes)); |
| EXPECT_EQ(0, errno); |
| EXPECT_EQ(sizeof(out_buf), out_bytes); |
| |
| EXPECT_EQ(0, iconv_close(c)); |
| } |