lib/Basic/Unicode.cpp - third_party/swift - Git at Google

 //===--- Unicode.cpp - Unicode utilities ----------------------------------===//
 //
 // This source file is part of the Swift.org open source project
 //
 // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See http://swift.org/LICENSE.txt for license information
 // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
 //
 //===----------------------------------------------------------------------===//

 #include "swift/Basic/Unicode.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ConvertUTF.h"

 using namespace swift;

 StringRef swift::unicode::extractFirstExtendedGraphemeCluster(StringRef S) {
   // Extended grapheme cluster segmentation algorithm as described in Unicode
   // Standard Annex #29.
   if (S.empty())
     return StringRef();

   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());

   const UTF8 *SourceNext = SourceStart;
   UTF32 C[2];
   UTF32 *TargetStart = C;

   ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 1,
                      lenientConversion);
   if (TargetStart == C) {
     // The source string contains an ill-formed subsequence at the end.
     return S;
   }

   GraphemeClusterBreakProperty GCBForC0 = getGraphemeClusterBreakProperty(C[0]);
   while (true) {
     if (isExtendedGraphemeClusterBoundaryAfter(GCBForC0))
       return S.slice(0, SourceNext - SourceStart);

     size_t C1Offset = SourceNext - SourceStart;
     ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 2,
                        lenientConversion);

     if (TargetStart == C + 1) {
       // End of source string or the source string contains an ill-formed
       // subsequence at the end.
       return S.slice(0, C1Offset);
     }

     GraphemeClusterBreakProperty GCBForC1 =
         getGraphemeClusterBreakProperty(C[1]);
     if (isExtendedGraphemeClusterBoundary(GCBForC0, GCBForC1))
       return S.slice(0, C1Offset);

     C[0] = C[1];
     TargetStart = C + 1;
     GCBForC0 = GCBForC1;
   }
 }

 static bool extractFirstUnicodeScalarImpl(StringRef S, unsigned &Scalar) {
   if (S.empty())
     return false;

   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());

   const UTF8 *SourceNext = SourceStart;
   UTF32 C;
   UTF32 *TargetStart = &C;

   ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
                      TargetStart + 1, lenientConversion);
   if (TargetStart == &C) {
     // The source string contains an ill-formed subsequence at the end.
     return false;
   }

   Scalar = C;
   return size_t(SourceNext - SourceStart) == S.size();
 }

 bool swift::unicode::isSingleUnicodeScalar(StringRef S) {
   unsigned Scalar;
   return extractFirstUnicodeScalarImpl(S, Scalar);
 }

 unsigned swift::unicode::extractFirstUnicodeScalar(StringRef S) {
   unsigned Scalar;
   bool Result = extractFirstUnicodeScalarImpl(S, Scalar);
   assert(Result && "string does not consist of one Unicode scalar");
   (void)Result;
   return Scalar;
 }

 uint64_t swift::unicode::getUTF16Length(StringRef Str) {
   uint64_t Length;
   // Transcode the string to UTF-16 to get its length.
   SmallVector<UTF16, 128> buffer(Str.size() + 1); // +1 for ending nulls.
   const UTF8 *fromPtr = (const UTF8 *) Str.data();
   UTF16 *toPtr = &buffer[0];
   ConversionResult Result = ConvertUTF8toUTF16(&fromPtr, fromPtr + Str.size(),
                                                &toPtr, toPtr + Str.size(),
                                                strictConversion);
   assert(Result == conversionOK &&
          "UTF-8 encoded string cannot be converted into UTF-16 encoding");
   (void)Result;

   // The length of the transcoded string in UTF-16 code points.
   Length = toPtr - &buffer[0];
   return Length;
 }
	//===--- Unicode.cpp - Unicode utilities ----------------------------------===//
	//
	// This source file is part of the Swift.org open source project
	//
	// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
	// Licensed under Apache License v2.0 with Runtime Library Exception
	//
	// See http://swift.org/LICENSE.txt for license information
	// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
	//
	//===----------------------------------------------------------------------===//

	#include "swift/Basic/Unicode.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Support/ConvertUTF.h"

	using namespace swift;

	StringRef swift::unicode::extractFirstExtendedGraphemeCluster(StringRef S) {
	// Extended grapheme cluster segmentation algorithm as described in Unicode
	// Standard Annex #29.
	if (S.empty())
	return StringRef();

	const UTF8 SourceStart = reinterpret_cast<const UTF8 >(S.data());

	const UTF8 *SourceNext = SourceStart;
	UTF32 C[2];
	UTF32 *TargetStart = C;

	ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 1,
	lenientConversion);
	if (TargetStart == C) {
	// The source string contains an ill-formed subsequence at the end.
	return S;
	}

	GraphemeClusterBreakProperty GCBForC0 = getGraphemeClusterBreakProperty(C[0]);
	while (true) {
	if (isExtendedGraphemeClusterBoundaryAfter(GCBForC0))
	return S.slice(0, SourceNext - SourceStart);

	size_t C1Offset = SourceNext - SourceStart;
	ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 2,
	lenientConversion);

	if (TargetStart == C + 1) {
	// End of source string or the source string contains an ill-formed
	// subsequence at the end.
	return S.slice(0, C1Offset);
	}

	GraphemeClusterBreakProperty GCBForC1 =
	getGraphemeClusterBreakProperty(C[1]);
	if (isExtendedGraphemeClusterBoundary(GCBForC0, GCBForC1))
	return S.slice(0, C1Offset);

	C[0] = C[1];
	TargetStart = C + 1;
	GCBForC0 = GCBForC1;
	}
	}

	static bool extractFirstUnicodeScalarImpl(StringRef S, unsigned &Scalar) {
	if (S.empty())
	return false;

	const UTF8 SourceStart = reinterpret_cast<const UTF8 >(S.data());

	const UTF8 *SourceNext = SourceStart;
	UTF32 C;
	UTF32 *TargetStart = &C;

	ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
	TargetStart + 1, lenientConversion);
	if (TargetStart == &C) {
	// The source string contains an ill-formed subsequence at the end.
	return false;
	}

	Scalar = C;
	return size_t(SourceNext - SourceStart) == S.size();
	}

	bool swift::unicode::isSingleUnicodeScalar(StringRef S) {
	unsigned Scalar;
	return extractFirstUnicodeScalarImpl(S, Scalar);
	}

	unsigned swift::unicode::extractFirstUnicodeScalar(StringRef S) {
	unsigned Scalar;
	bool Result = extractFirstUnicodeScalarImpl(S, Scalar);
	assert(Result && "string does not consist of one Unicode scalar");
	(void)Result;
	return Scalar;
	}

	uint64_t swift::unicode::getUTF16Length(StringRef Str) {
	uint64_t Length;
	// Transcode the string to UTF-16 to get its length.
	SmallVector<UTF16, 128> buffer(Str.size() + 1); // +1 for ending nulls.
	const UTF8 fromPtr = (const UTF8 ) Str.data();
	UTF16 *toPtr = &buffer[0];
	ConversionResult Result = ConvertUTF8toUTF16(&fromPtr, fromPtr + Str.size(),
	&toPtr, toPtr + Str.size(),
	strictConversion);
	assert(Result == conversionOK &&
	"UTF-8 encoded string cannot be converted into UTF-16 encoding");
	(void)Result;

	// The length of the transcoded string in UTF-16 code points.
	Length = toPtr - &buffer[0];
	return Length;
	}