| //===--- Unicode.cpp - Unicode utilities ----------------------------------===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See https://swift.org/LICENSE.txt for license information |
| // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "swift/Basic/Unicode.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/Support/ConvertUTF.h" |
| |
| using namespace swift; |
| |
| // HACK: Allow support for many newer emoji by overriding behavior of ZWJ and |
| // emoji modifiers. This does not make the breaks correct for any version of |
| // Unicode, but shifts the ways in which it is incorrect to be less harmful. |
| // |
| // TODO: Remove this hack and reevaluate whether we should have any static |
| // notion of what a grapheme is. |
| // |
| // Returns true if lhs and rhs shouldn't be considered as having a grapheme |
| // break between them. That is, whether we're overriding the behavior of the |
| // hard coded Unicode 8 rules surrounding ZWJ and emoji modifiers. |
| static inline bool graphemeBreakOverride(llvm::UTF32 lhs, llvm::UTF32 rhs) { |
| // Assume ZWJ sequences produce new emoji |
| if (lhs == 0x200D) { |
| return true; |
| } |
| |
| // Permit continuing regional indicators |
| if (rhs >= 0x1F3FB && rhs <= 0x1F3FF) { |
| return true; |
| } |
| |
| // Permit emoji tag sequences |
| if (rhs >= 0xE0020 && rhs <= 0xE007F) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| StringRef swift::unicode::extractFirstExtendedGraphemeCluster(StringRef S) { |
| // Extended grapheme cluster segmentation algorithm as described in Unicode |
| // Standard Annex #29. |
| if (S.empty()) |
| return StringRef(); |
| |
| const llvm::UTF8 *SourceStart = |
| reinterpret_cast<const llvm::UTF8 *>(S.data()); |
| |
| const llvm::UTF8 *SourceNext = SourceStart; |
| llvm::UTF32 C[2]; |
| llvm::UTF32 *TargetStart = C; |
| |
| ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 1, |
| llvm::lenientConversion); |
| if (TargetStart == C) { |
| // The source string contains an ill-formed subsequence at the end. |
| return S; |
| } |
| |
| GraphemeClusterBreakProperty GCBForC0 = getGraphemeClusterBreakProperty(C[0]); |
| while (true) { |
| size_t C1Offset = SourceNext - SourceStart; |
| ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 2, |
| llvm::lenientConversion); |
| |
| if (TargetStart == C + 1) { |
| // End of source string or the source string contains an ill-formed |
| // subsequence at the end. |
| return S.slice(0, C1Offset); |
| } |
| |
| GraphemeClusterBreakProperty GCBForC1 = |
| getGraphemeClusterBreakProperty(C[1]); |
| if (isExtendedGraphemeClusterBoundary(GCBForC0, GCBForC1) && |
| !graphemeBreakOverride(C[0], C[1])) |
| return S.slice(0, C1Offset); |
| |
| C[0] = C[1]; |
| TargetStart = C + 1; |
| GCBForC0 = GCBForC1; |
| } |
| } |
| |
| static bool extractFirstUnicodeScalarImpl(StringRef S, unsigned &Scalar) { |
| if (S.empty()) |
| return false; |
| |
| const llvm::UTF8 *SourceStart = |
| reinterpret_cast<const llvm::UTF8 *>(S.data()); |
| |
| const llvm::UTF8 *SourceNext = SourceStart; |
| llvm::UTF32 C; |
| llvm::UTF32 *TargetStart = &C; |
| |
| ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, |
| TargetStart + 1, llvm::lenientConversion); |
| if (TargetStart == &C) { |
| // The source string contains an ill-formed subsequence at the end. |
| return false; |
| } |
| |
| Scalar = C; |
| return size_t(SourceNext - SourceStart) == S.size(); |
| } |
| |
| bool swift::unicode::isSingleUnicodeScalar(StringRef S) { |
| unsigned Scalar; |
| return extractFirstUnicodeScalarImpl(S, Scalar); |
| } |
| |
| unsigned swift::unicode::extractFirstUnicodeScalar(StringRef S) { |
| unsigned Scalar; |
| bool Result = extractFirstUnicodeScalarImpl(S, Scalar); |
| assert(Result && "string does not consist of one Unicode scalar"); |
| (void)Result; |
| return Scalar; |
| } |
| |
| uint64_t swift::unicode::getUTF16Length(StringRef Str) { |
| uint64_t Length; |
| // Transcode the string to UTF-16 to get its length. |
| SmallVector<llvm::UTF16, 128> buffer(Str.size() + 1); // +1 for ending nulls. |
| const llvm::UTF8 *fromPtr = (const llvm::UTF8 *) Str.data(); |
| llvm::UTF16 *toPtr = &buffer[0]; |
| llvm::ConversionResult Result = |
| ConvertUTF8toUTF16(&fromPtr, fromPtr + Str.size(), |
| &toPtr, toPtr + Str.size(), |
| llvm::strictConversion); |
| assert(Result == llvm::conversionOK && |
| "UTF-8 encoded string cannot be converted into UTF-16 encoding"); |
| (void)Result; |
| |
| // The length of the transcoded string in UTF-16 code points. |
| Length = toPtr - &buffer[0]; |
| return Length; |
| } |