Merge pull request #10693 from milseman/zalgorithmic_complexity [stdlib] Update non-contiguous NSStrings to Unicode 9

commit: 091738cf9758bed3bdecaac50a8af45b76546a3d [log] [tgz]
author: Michael Ilseman <michael.ilseman@gmail.com> Fri Jun 30 14:13:39 2017 -0700
committer: GitHub <noreply@github.com> Fri Jun 30 14:13:39 2017 -0700
tree: ab1ac27e7c23223b44ce73c492086b84ff6c2103
parent: 2eebb1a8e1bc898ec02d8e695c9f58607c6f0ac7 [diff]
parent: b6ac8c61abe05b0bba9a6fc793ef07fd282bd7cb [diff]
diff --git a/stdlib/public/core/CMakeLists.txt b/stdlib/public/core/CMakeLists.txt
index 4e8683b..005e0bf 100644
--- a/stdlib/public/core/CMakeLists.txt
+++ b/stdlib/public/core/CMakeLists.txt

@@ -55,6 +55,7 @@
   ErrorType.swift
   Existential.swift
   Filter.swift.gyb
+  FixedArray.swift.gyb
   FlatMap.swift
   Flatten.swift.gyb
   FloatingPoint.swift.gyb

diff --git a/stdlib/public/core/FixedArray.swift.gyb b/stdlib/public/core/FixedArray.swift.gyb
new file mode 100644
index 0000000..4fa02db
--- /dev/null
+++ b/stdlib/public/core/FixedArray.swift.gyb

@@ -0,0 +1,113 @@
+//===--- FixedArray.swift.gyb ---------------------------------*- swift -*-===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+//
+//  A helper struct to provide fixed-sized array like functionality
+//
+//===----------------------------------------------------------------------===//
+
+%{
+  # The sizes to generate code for.
+  sizes = [16]
+}%
+
+% for N in sizes:
+
+internal struct _FixedArray${N}<T> {
+  // ABI TODO: This has assumptions about tuple layout in the ABI, namely that
+  // tuple elements are laid out contiguously and are individually addressable
+  // (i.e. strided). The subscript accessors read the tuple as a buffer of T.
+  internal var storage: (
+    // A ${N}-wide tuple of type T
+% for i in range(0, N-1):
+    T,
+% end
+    T
+  )
+
+  static var _arraySize : Int { return ${N} } // element count of `storage`
+}
+
+/// Integer-indexed, mutable collection view over the ${N}-element tuple held
+/// in `storage`.
+extension _FixedArray${N} : RandomAccessCollection, MutableCollection {
+  internal typealias Index = Int
+  internal typealias IndexDistance = Int
+
+  /// The position of the first element; always zero.
+  internal var startIndex : Index {
+    return 0
+  }
+
+  /// The position one past the last element; always `_arraySize`.
+  internal var endIndex : Index {
+    return _FixedArray${N}._arraySize
+  }
+
+  /// The number of elements; always `_arraySize`.
+  internal var count : IndexDistance { return _FixedArray${N}._arraySize }
+
+  /// Accesses the element at offset `i` by reinterpreting the tuple storage
+  /// as a buffer of `count` values of type `T`.
+  ///
+  /// `i` must be in `0..<count`; this is only verified by `_sanityCheck`.
+  internal subscript(i: Index) -> T {
+    @_versioned
+    @inline(__always)
+    get {
+      let count = self.count
+      _sanityCheck(i >= 0 && i < count, "index out of bounds")
+      // `withUnsafeBytes(of:)` takes its argument inout, so a non-mutating
+      // getter has to operate on a local copy of the tuple.
+      var copy = storage
+      let res: T = withUnsafeBytes(of: &copy) {
+        (rawPtr : UnsafeRawBufferPointer) -> T in
+        let stride = MemoryLayout<T>.stride
+        _sanityCheck(rawPtr.count == ${N}*stride, "layout mismatch?")
+        let bufPtr = UnsafeBufferPointer(
+          start: rawPtr.baseAddress!.assumingMemoryBound(to: T.self),
+          count: count)
+        return bufPtr[i]
+      }
+      return res
+    }
+    @_versioned
+    @inline(__always)
+    set {
+      // Read `count` before taking inout access to `storage`, so the closure
+      // does not access `self` while `storage` is being modified.
+      let count = self.count
+      _sanityCheck(i >= 0 && i < count, "index out of bounds")
+      // Request mutable bytes directly instead of casting away the constness
+      // of a read-only raw buffer.
+      withUnsafeMutableBytes(of: &storage) {
+        (rawPtr : UnsafeMutableRawBufferPointer) -> () in
+        let stride = MemoryLayout<T>.stride
+        _sanityCheck(rawPtr.count == ${N}*stride, "layout mismatch?")
+        let bufPtr = UnsafeMutableBufferPointer(
+          start: rawPtr.baseAddress!.assumingMemoryBound(to: T.self),
+          count: count)
+        bufPtr[i] = newValue
+      }
+    }
+  }
+
+  /// Returns the position immediately after `i`.
+  @_versioned
+  @inline(__always)
+  internal func index(after i: Index) -> Index {
+    return i+1
+  }
+
+  /// Returns the position immediately before `i`.
+  @_versioned
+  @inline(__always)
+  internal func index(before i: Index) -> Index {
+    return i-1
+  }
+
+  // TODO: Any customization hooks it's profitable to override, e.g. append?
+
+}
+
+extension _FixedArray${N} where T: ExpressibleByIntegerLiteral {
+  /// Creates a fixed array in which every element is the integer literal `0`.
+  ///
+  /// The `allZeros` parameter carries no data and only selects this
+  /// initializer; call it as `_FixedArray${N}<T>(allZeros:())`.
+  @inline(__always)
+  internal init(allZeros:()) {
+    self.storage = (
+% for i in range(0, N-1):
+    0,
+% end
+    0
+    )
+  }
+}
+
+% end

diff --git a/stdlib/public/core/GroupInfo.json b/stdlib/public/core/GroupInfo.json
index c4c41db..f816522 100644
--- a/stdlib/public/core/GroupInfo.json
+++ b/stdlib/public/core/GroupInfo.json

@@ -92,6 +92,7 @@
         "Arrays.swift",
         "CocoaArray.swift",
         "ContiguousArrayBuffer.swift",
+        "FixedArray.swift",
         "SliceBuffer.swift",
         "SwiftNativeNSArray.swift"],
       "HashedCollections": [

diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift
index debf0ba..13169af 100644
--- a/stdlib/public/core/StringCharacterView.swift
+++ b/stdlib/public/core/StringCharacterView.swift

@@ -15,8 +15,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FIXME(ABI)#70 : The character string view should have a custom iterator type to
-// allow performance optimizations of linear traversals.
+// FIXME(ABI)#70 : The character string view should have a custom iterator type
+// to allow performance optimizations of linear traversals.
 
 /// CR and LF are common special cases in grapheme breaking logic
 @_versioned internal var _CR: UInt8 { return 0x0d }
@@ -102,7 +102,8 @@
   /// of the string.
   ///
   ///     var str = "All this happened, more or less."
-  ///     let afterSpace = str.withMutableCharacters { chars -> String.CharacterView in
+  ///     let afterSpace = str.withMutableCharacters {
+  ///         chars -> String.CharacterView in
   ///         if let i = chars.index(of: " ") {
   ///             let result = chars[chars.index(after: i)...]
   ///             chars.removeSubrange(i...)
@@ -364,128 +365,130 @@
   internal func _measureExtendedGraphemeClusterForward(
     from start: UnicodeScalarView.Index
   ) -> Int {
-    let end = unicodeScalars.endIndex
-    if start == end {
+    let startPosition = start._position
+    let endPosition = unicodeScalars.endIndex._position
+
+    // No more graphemes
+    if startPosition == endPosition {
       return 0
     }
 
-    // Our relative position (offset). If our _core is not a substring, this is
-    // the same as start._position.
-    let relativeOffset = start._position - _coreOffset
+    // Last code unit means final grapheme length of 1
+    if startPosition == endPosition - 1 {
+      return 1
+    }
+
+    // Our relative offset from the _StringCore's baseAddress pointer. If our
+    // _core is not a substring, this is the same as start._position. Otherwise,
+    // it is the code unit relative offset into the substring and not the
+    // absolute offset into the outer string.
+    let startOffset = startPosition - _coreOffset
 
     // Grapheme breaking is much simpler if known ASCII
     if _core.isASCII {
       _onFastPath() // Please aggressively inline
       let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked
+      _sanityCheck(startOffset+1 < asciiBuffer.endIndex, 
+        "Already checked for last code unit")
 
       // With the exception of CR-LF, ASCII graphemes are single-scalar. Check
       // for that one exception.
       if _slowPath(
-        asciiBuffer[relativeOffset] == _CR &&
-        relativeOffset+1 < asciiBuffer.endIndex &&
-        asciiBuffer[relativeOffset+1] == _LF
+        asciiBuffer[startOffset] == _CR &&
+        asciiBuffer[startOffset+1] == _LF
       ) {
         return 2
       }
 
       return 1
-    } else {
-      // TODO: Check for (potentially non-contiguous) ASCII NSStrings,
-      // especially small tagged pointers.
     }
     
-    let startIndexUTF16 = start._position
-
-    // Last scalar is its own grapheme
-    if (startIndexUTF16+1 == end._position) {
+    // Perform a quick single-code-unit grapheme check.
+    if _fastPath(String.CharacterView._quickCheckGraphemeBreakBetween(
+        _core[startOffset],
+        _core[startOffset+1])
+    ) {
       return 1
     }
 
-    // Perform a quick single-code-unit grapheme check
-    if _core._baseAddress != nil {
-      if String.CharacterView._quickCheckGraphemeBreakBetween(
-        _core._nthContiguous(relativeOffset),
-        _core._nthContiguous(relativeOffset+1)
-      ) {
-        return 1
-      }
-    } else {
-      // TODO: Check for (potentially non-contiguous) UTF16 NSStrings,
-      // especially small tagged pointers
-    }
-    return _measureExtendedGraphemeClusterForwardSlow(
-      relativeOffset: relativeOffset,
-      start: start,
-      end: end,
-      startIndexUTF16: startIndexUTF16
-    )
+    return _measureExtendedGraphemeClusterForwardSlow(startOffset: startOffset)
   }
   
   @inline(never)
   @_versioned
   func _measureExtendedGraphemeClusterForwardSlow(
-    relativeOffset: Int,
-    start: String.UnicodeScalarView.Index,
-    end: String.UnicodeScalarView.Index,
-    startIndexUTF16: Int
+    startOffset: Int
   ) -> Int {
-    if _core._baseAddress != nil {
+    let endOffset = unicodeScalars.endIndex._position - _coreOffset
+    let numCodeUnits = endOffset - startOffset
+    _sanityCheck(numCodeUnits >= 2, "should have at least two code units")
+
+    // The vast majority of time, we can get a pointer and a length directly
+    if _fastPath(_core._baseAddress != nil) {
       _onFastPath() // Please aggressively inline
       let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: _core)
-      let ubrkFollowing = __swift_stdlib_ubrk_following(
-        breakIterator, Int32(relativeOffset)
+      let ubrkFollowingOffset = __swift_stdlib_ubrk_following(
+        breakIterator, Int32(startOffset)
       )
       // ubrk_following may return UBRK_DONE (-1). Treat that as the rest of the
       // string.
-      let nextPosition =
-        ubrkFollowing == -1 ? end._position : Int(ubrkFollowing)
-      return nextPosition - relativeOffset
-    } else {
-      // TODO: See if we can get fast character contents.
+      if _slowPath(ubrkFollowingOffset == -1) {
+        return numCodeUnits
+      }
+      _sanityCheck(startOffset != Int(ubrkFollowingOffset), 
+        "zero-sized grapheme?")
+      return Int(ubrkFollowingOffset) - startOffset
     }
 
-    // FIXME: Need to handle the general case correctly with Unicode 9+
-    // semantics, as opposed to this legacy Unicode 8 path. This gets hit for
-    // e.g. non-contiguous NSStrings. In such cases, there may be an alternative
-    // CFString API available, or worst case we can map over it via UTextFuncs.
-
-    return legacyGraphemeForward(
-      start: start, end: end, startIndexUTF16: startIndexUTF16
-    )
-  }
-
-  @inline(never)
-  func legacyGraphemeForward(
-    start: UnicodeScalarView.Index,
-    end: UnicodeScalarView.Index,
-    startIndexUTF16: Int
-  ) -> Int {
-    var start = start
-    let graphemeClusterBreakProperty =
-      _UnicodeGraphemeClusterBreakPropertyTrie()
-    let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()
-    
-    var gcb0 = graphemeClusterBreakProperty.getPropertyRawValue(
-      unicodeScalars[start].value)
-    unicodeScalars.formIndex(after: &start)
-    
-    while start != end {
-      // FIXME(performance): consider removing this "fast path".  A branch
-      // that is hard to predict could be worse for performance than a few
-      // loads from cache to fetch the property 'gcb1'.
-      if segmenter.isBoundaryAfter(gcb0) {
-        break
-      }
-      let gcb1 = graphemeClusterBreakProperty.getPropertyRawValue(
-        unicodeScalars[start].value)
-      if segmenter.isBoundary(gcb0, gcb1) {
-        break
-      }
-      gcb0 = gcb1
-      unicodeScalars.formIndex(after: &start)
+    // We have a non-contiguous string. Pull out some code units into a fixed
+    // array and try to perform grapheme breaking on that. If even that's not
+    // sufficient (i.e. very pathological) then copy into an Array.
+    var codeUnitBuffer = _FixedArray16<UInt16>(allZeros:())
+    let maxBufferCount = codeUnitBuffer.count
+    let bufferCount = Swift.min(maxBufferCount, numCodeUnits)
+    for i in 0..<bufferCount {
+      codeUnitBuffer[i] = _core[startOffset+i]
     }
-    
-    return start._position - startIndexUTF16
+
+    return withUnsafeBytes(of: &codeUnitBuffer.storage) {
+      (rawPtr : UnsafeRawBufferPointer) -> Int in
+      let bufPtr = UnsafeBufferPointer(
+        start: rawPtr.baseAddress!.assumingMemoryBound(to: UInt16.self),
+        count: bufferCount)
+
+      let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: bufPtr)
+      let ubrkFollowingOffset = __swift_stdlib_ubrk_following(
+        breakIterator, Int32(0))
+
+      if _fastPath(
+        bufferCount < maxBufferCount ||
+        (ubrkFollowingOffset != -1 && ubrkFollowingOffset != maxBufferCount)
+      ) {
+        // The offset into our buffer *is* the distance.
+        _sanityCheck(ubrkFollowingOffset != 0, "zero-sized grapheme?")
+        return Int(ubrkFollowingOffset)
+      }
+
+      // Nuclear option: copy out the rest of the string into an array
+      var codeUnits = Array<UInt16>()
+      codeUnits.reserveCapacity(numCodeUnits)
+      for i in startOffset..<endOffset {
+        codeUnits.append(_core[i])
+      }
+      return codeUnits.withUnsafeBufferPointer { bufPtr -> Int in
+        let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: bufPtr)
+        let ubrkFollowingOffset = __swift_stdlib_ubrk_following(
+          breakIterator, Int32(0)
+        )
+        // ubrk_following may return UBRK_DONE (-1). Treat that as the rest of
+        // the string.
+        if _slowPath(ubrkFollowingOffset == -1) {
+          return numCodeUnits
+        }
+        _sanityCheck(ubrkFollowingOffset != 0, "zero-sized grapheme?")
+        return Int(ubrkFollowingOffset)
+      }
+    }
   }
 
   // NOTE: Because this function is inlineable, it should contain only the fast
@@ -498,14 +501,25 @@
   internal func _measureExtendedGraphemeClusterBackward(
     from end: UnicodeScalarView.Index
   ) -> Int {
-    let start = unicodeScalars.startIndex
-    if start == end {
+    let startPosition = unicodeScalars.startIndex._position
+    let endPosition = end._position
+
+    // No more graphemes
+    if startPosition == endPosition {
       return 0
     }
 
-    // The relative position (offset) to the last code unit.
-    let lastOffset = end._position - _coreOffset - 1
-    // The relative position (offset) that is one-past-the-last
+    // Last code unit means final grapheme length of 1
+    if startPosition == endPosition - 1 {
+      return 1
+    }
+
+    // The relative offset from the _StringCore's baseAddress pointer for the
+    // one-past-the-end and the last code unit under consideration.  If our
+    // _core is not a substring, these are the same as positions. Otherwise,
+    // these are code unit relative offsets into the substring and not the
+    // absolute positions into the outer string.
+    let lastOffset = endPosition - _coreOffset - 1
     let endOffset = lastOffset + 1
 
     // Grapheme breaking is much simpler if known ASCII
@@ -513,15 +527,14 @@
       _onFastPath() // Please aggressively inline
       let asciiBuffer = _core.asciiBuffer._unsafelyUnwrappedUnchecked
       _sanityCheck(
-        lastOffset >= asciiBuffer.startIndex,
-        "should of been caught in earlier start-of-scalars check")
+        lastOffset-1 >= asciiBuffer.startIndex,
+        "should of been caught in earlier trivially-sized checks")
 
       // With the exception of CR-LF, ASCII graphemes are single-scalar. Check
       // for that one exception.
       if _slowPath(
-        asciiBuffer[lastOffset] == _LF &&
-        lastOffset-1 >= asciiBuffer.startIndex &&
-        asciiBuffer[lastOffset-1] == _CR
+        asciiBuffer[lastOffset-1] == _CR &&
+        asciiBuffer[lastOffset] == _LF
       ) {
         return 2
       }
@@ -529,92 +542,94 @@
       return 1
     }
     
-    let endIndexUTF16 = end._position
-
-    // First scalar is its own grapheme
-    if (endIndexUTF16-1 == start._position) {
+    // Perform a quick single-code-unit grapheme check
+    if _fastPath(String.CharacterView._quickCheckGraphemeBreakBetween(
+      _core[lastOffset-1], _core[lastOffset])
+    ) {
       return 1
     }
 
-    // Perform a quick single-code-unit grapheme check
-    if _core._baseAddress != nil {
-      if String.CharacterView._quickCheckGraphemeBreakBetween(
-        _core._nthContiguous(lastOffset-1),
-        _core._nthContiguous(lastOffset)
-      ) {
-        return 1
-      }
-    }
-    return _measureExtendedGraphemeClusterBackwardSlow(
-      endOffset: endOffset, start: start, end: end, endIndexUTF16: endIndexUTF16
-    )
+    return _measureExtendedGraphemeClusterBackwardSlow(endOffset: endOffset)
   }
   
   @inline(never)
   @_versioned
   func _measureExtendedGraphemeClusterBackwardSlow(
-    endOffset: Int,
-    start: String.UnicodeScalarView.Index,
-    end: String.UnicodeScalarView.Index,
-    endIndexUTF16: Int
+    endOffset: Int
   ) -> Int {
-    if _core._baseAddress != nil {
-      _onFastPath() // Please aggressively inline
-      let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: _core)
-      let ubrkPreceding = __swift_stdlib_ubrk_preceding(
-        breakIterator, Int32(endOffset)
-      )
+    let startOffset = 0
+    let numCodeUnits = endOffset - startOffset
+    _sanityCheck(unicodeScalars.startIndex._position - _coreOffset == 0,
+      "position/offset mismatch in _StringCore as a substring")
+    _sanityCheck(numCodeUnits >= 2,
+      "should have at least two code units")
+
+    func measureFromUBreakOffset(_ ubrkOffset: Int32) -> Int {
       // ubrk_following may return UBRK_DONE (-1). Treat that as the rest of the
       // string.
-      let priorPosition =
-        ubrkPreceding == -1 ? start._position : Int(ubrkPreceding)
-      return endOffset - priorPosition
-    } else {
-      // TODO: See if we can get fast character contents.
-    }
-
-    // FIXME: Need to handle the general case correctly with Unicode 9+
-    // semantics, as opposed to this legacy Unicode 8 path. This gets hit for
-    // e.g. non-contiguous NSStrings. In such cases, there may be an alternative
-    // CFString API available, or worst case we can map over it via UTextFuncs.
-
-    return legacyGraphemeBackward(
-      start: start, end: end, endIndexUTF16: endIndexUTF16
-    )
-  }
-
-  @inline(never)
-  func legacyGraphemeBackward(
-    start: UnicodeScalarView.Index,
-    end: UnicodeScalarView.Index,
-    endIndexUTF16: Int
-  ) -> Int {
-    let graphemeClusterBreakProperty =
-      _UnicodeGraphemeClusterBreakPropertyTrie()
-    let segmenter = _UnicodeExtendedGraphemeClusterSegmenter()
-    
-    var graphemeClusterStart = end
-    
-    unicodeScalars.formIndex(before: &graphemeClusterStart)
-    var gcb0 = graphemeClusterBreakProperty.getPropertyRawValue(
-      unicodeScalars[graphemeClusterStart].value)
-    
-    var graphemeClusterStartUTF16 = graphemeClusterStart._position
-    
-    while graphemeClusterStart != start {
-      unicodeScalars.formIndex(before: &graphemeClusterStart)
-      let gcb1 = graphemeClusterBreakProperty.getPropertyRawValue(
-        unicodeScalars[graphemeClusterStart].value)
-      if segmenter.isBoundary(gcb1, gcb0) {
-        break
+      if _slowPath(ubrkOffset == -1) {
+        return numCodeUnits
       }
-      gcb0 = gcb1
-      graphemeClusterStartUTF16 = graphemeClusterStart._position
+      _sanityCheck(endOffset > Int(ubrkOffset), "zero-sized grapheme?")
+      return endOffset - Int(ubrkOffset)
     }
-    
-    return endIndexUTF16 - graphemeClusterStartUTF16
+
+    // The vast majority of time, we can get a pointer and a length directly
+    if _fastPath(_core._baseAddress != nil) {
+      _onFastPath() // Please aggressively inline
+      let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: _core)
+      let ubrkPrecedingOffset = __swift_stdlib_ubrk_preceding(
+        breakIterator, Int32(endOffset)
+      )
+      return measureFromUBreakOffset(ubrkPrecedingOffset)
+    }
+
+    // We have a non-contiguous string. Pull out some code units into a fixed
+    // array and try to perform grapheme breaking on that. If even that's not
+    // sufficient (i.e. very pathological) then copy into an Array.
+    var codeUnitBuffer = _FixedArray16<UInt16>(allZeros:())
+    let maxBufferCount = codeUnitBuffer.count
+    let coreStartIdx = Swift.max(startOffset, endOffset - maxBufferCount)
+    let bufferCount = Swift.min(maxBufferCount, numCodeUnits)
+    for i in 0..<bufferCount {
+      codeUnitBuffer[i] = _core[coreStartIdx+i]
+    }
+
+    return withUnsafeBytes(of: &codeUnitBuffer.storage) {
+      (rawPtr : UnsafeRawBufferPointer) -> Int in
+      let bufPtr = UnsafeBufferPointer(
+        start: rawPtr.baseAddress!.assumingMemoryBound(to: UInt16.self),
+        count: bufferCount)
+
+      let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: bufPtr)
+      let ubrkPrecedingOffset = __swift_stdlib_ubrk_preceding(
+        breakIterator, Int32(bufferCount)
+      )
+
+      if _fastPath(numCodeUnits < maxBufferCount || ubrkPrecedingOffset > 1) {
+        // There was a grapheme break within our buffer.
+        _sanityCheck(ubrkPrecedingOffset < bufferCount, "offset mismatch")
+        return bufferCount - Int(ubrkPrecedingOffset)
+      }
+
+      // Nuclear option: copy out the prefix of the string into an array
+      var codeUnits = Array<UInt16>()
+      codeUnits.reserveCapacity(numCodeUnits)
+      for i in startOffset..<endOffset {
+        codeUnits.append(_core[i])
+      }
+      return codeUnits.withUnsafeBufferPointer { bufPtr -> Int in
+        let breakIterator = _ThreadLocalStorage.getUBreakIterator(for: bufPtr)
+        let ubrkPrecedingOffset = __swift_stdlib_ubrk_preceding(
+          breakIterator, Int32(endOffset)
+        )
+        // No need to adjust ubrkPrecedingOffset as we copied the prefix: it is
+        // the position in the original string
+        return measureFromUBreakOffset(ubrkPrecedingOffset)
+      }
+    }    
   }
-  
+
   /// Accesses the character at the given position.
   ///
   /// The following example searches a string's character view for a capital

diff --git a/stdlib/public/core/Substring.swift.gyb b/stdlib/public/core/Substring.swift.gyb
index 6add01c..8bb704f 100644
--- a/stdlib/public/core/Substring.swift.gyb
+++ b/stdlib/public/core/Substring.swift.gyb

@@ -283,8 +283,11 @@
     let wholeCore = _slice._base._core
     let subCore : _StringCore = wholeCore[
       startIndex._base._position..<endIndex._base._position]
-    // check that we haven't allocated a new buffer for the result
-    _sanityCheck(subCore._owner === wholeCore._owner)
+    // Check that we haven't allocated a new buffer for the result, if we have
+    // contiguous storage.
+    _sanityCheck(
+      subCore._owner === wholeCore._owner || !wholeCore.hasContiguousStorage)
+
     return String(subCore)
   }
 }

diff --git a/stdlib/public/core/ThreadLocalStorage.swift b/stdlib/public/core/ThreadLocalStorage.swift
index cb213d5..2c09ac3 100644
--- a/stdlib/public/core/ThreadLocalStorage.swift
+++ b/stdlib/public/core/ThreadLocalStorage.swift

@@ -74,20 +74,25 @@
   }
 
   // Retrieve our thread's local uBreakIterator and set it up for the given
-  // StringCore. Checks our TLS cache to avoid excess text resetting.
+  // StringCore.
   static internal func getUBreakIterator(
     for core: _StringCore
   ) -> OpaquePointer {
+    _sanityCheck(core._owner != nil || core._baseAddress != nil,
+      "invalid StringCore")
+    let corePtr: UnsafeMutablePointer<UTF16.CodeUnit> = core.startUTF16
+    return getUBreakIterator(
+      for: UnsafeBufferPointer(start: corePtr, count: core.count))
+  }
+  static internal func getUBreakIterator(
+    for bufPtr: UnsafeBufferPointer<UTF16.CodeUnit>
+  ) -> OpaquePointer {
     let tlsPtr = getPointer()
     let brkIter = tlsPtr[0].uBreakIterator
 
-    _sanityCheck(core._owner != nil || core._baseAddress != nil,
-      "invalid StringCore")
-
     var err = __swift_stdlib_U_ZERO_ERROR
-    let corePtr: UnsafeMutablePointer<UTF16.CodeUnit>
-    corePtr = core.startUTF16
-    __swift_stdlib_ubrk_setText(brkIter, corePtr, Int32(core.count), &err)
+    __swift_stdlib_ubrk_setText(
+      brkIter, bufPtr.baseAddress!, Int32(bufPtr.count), &err)
     _precondition(err.isSuccess, "unexpected ubrk_setUText failure")
 
     return brkIter

diff --git a/test/stdlib/Inputs/NSSlowString/NSSlowString.h b/test/stdlib/Inputs/NSSlowString/NSSlowString.h
new file mode 100644
index 0000000..548939c
--- /dev/null
+++ b/test/stdlib/Inputs/NSSlowString/NSSlowString.h

@@ -0,0 +1,10 @@
+#import <Foundation/NSString.h>
+
+// An NSString whose _fastCharacterContents always returns nil
+@interface NSSlowString : NSString
+
+@property (nonatomic, strong) id myProperty; // not referenced by NSSlowString.m
+
+- (void *) _fastCharacterContents; // implemented in NSSlowString.m to return nil
+
+@end

diff --git a/test/stdlib/Inputs/NSSlowString/NSSlowString.m b/test/stdlib/Inputs/NSSlowString/NSSlowString.m
new file mode 100644
index 0000000..5f38447
--- /dev/null
+++ b/test/stdlib/Inputs/NSSlowString/NSSlowString.m

@@ -0,0 +1,37 @@
+#import "NSSlowString.h"
+
+
+@interface NSSlowString ()
+
+@property (nonatomic, strong) NSString *stringHolder;
+
+@end
+
+@implementation NSSlowString
+
+// Keeps `name` as the backing string that length and character lookups are
+// forwarded to.
+- (instancetype)initWithString:(NSString *)name {
+  if ((self = [super init])) {
+    [self setStringHolder:name];
+  }
+  return self;
+}
+
+// Length of the backing string.
+- (NSUInteger)length {
+  return [[self stringHolder] length];
+}
+
+// Hands back the receiver itself instead of creating a new object.
+- (id)copy {
+  return self;
+}
+
+// Character of the backing string at `index`.
+- (unichar)characterAtIndex:(NSUInteger)index {
+  return [[self stringHolder] characterAtIndex:index];
+}
+
+// Always returns nil.
+- (void *) _fastCharacterContents {
+  return nil;
+}
+
+@end
\ No newline at end of file

diff --git a/test/stdlib/Inputs/NSSlowString/module.map b/test/stdlib/Inputs/NSSlowString/module.map
new file mode 100644
index 0000000..60ce0d3
--- /dev/null
+++ b/test/stdlib/Inputs/NSSlowString/module.map

@@ -0,0 +1,3 @@
+module NSSlowString {
+  header "NSSlowString.h"
+}

diff --git a/test/stdlib/NSSlowString.swift b/test/stdlib/NSSlowString.swift
new file mode 100644
index 0000000..5c649bd
--- /dev/null
+++ b/test/stdlib/NSSlowString.swift

@@ -0,0 +1,79 @@
+// RUN: mkdir -p %t
+// RUN: %target-clang -fobjc-arc %S/Inputs/NSSlowString/NSSlowString.m -c -o %t/NSSlowString.o
+// RUN: %target-build-swift -I %S/Inputs/NSSlowString/ %t/NSSlowString.o %s -o %t/a.out
+// RUN: %target-run %t/a.out
+
+// REQUIRES: executable_test
+// REQUIRES: objc_interop
+
+import Foundation
+import NSSlowString
+import Swift
+
+import StdlibUnittest
+
+let tests = TestSuite("NonContiguousStrings")
+
+// Verify the character count of `s` (and its UTF-16 code unit count, when one
+// is supplied), then verify that reversing its characters round-trips.
+func checkSingleForm<S: StringProtocol>(
+	_ s: S, expectedCount: Int, expectedCodeUnitCount: Int?
+) {
+	expectEqual(expectedCount, Int(s.count))
+	if let expectedUnits = expectedCodeUnitCount {
+		expectEqual(expectedUnits, Int(s.utf16.count))
+	}
+
+	// Walk the string backwards and compare against the forward results.
+	let backwards = Array<Character>(s.reversed())
+
+	expectEqual(Int(s.count), backwards.count)
+	expectEqualSequence(s.reversed(), backwards)
+	expectEqual(String(s), String(backwards.reversed()))
+}
+// Run `checkSingleForm` on `s` itself and on several substrings of it.
+func check(
+	_ s: String,
+	expectedCount characters: Int,
+	expectedCodeUnitCount codeUnits: Int
+) {
+	checkSingleForm(
+		s, expectedCount: characters, expectedCodeUnitCount: codeUnits)
+
+	// The full-range substring must agree with the whole string.
+	checkSingleForm(
+		s[...], expectedCount: characters, expectedCodeUnitCount: codeUnits)
+
+	// Trimmed substrings: only the character count is checked.
+	checkSingleForm(
+		s.dropFirst(),
+		expectedCount: characters-1, expectedCodeUnitCount: nil)
+	checkSingleForm(
+		s.dropLast(),
+		expectedCount: characters-1, expectedCodeUnitCount: nil)
+	checkSingleForm(
+		s.dropLast().dropFirst(),
+		expectedCount: characters-2, expectedCodeUnitCount: nil)
+}
+
+tests.test("Unicode 9 grapheme breaking") {
+	// Strings whose UTF-16 length is smaller than, larger than, and exactly
+	// equal to the 16 code unit fixed-size buffer.
+	let cases: [(text: String, characters: Int, codeUnits: Int)] = [
+		("a👍👩‍👩‍👧‍👦", 3, 14),
+		("abcdefg👍👩‍👩‍👧‍👦", 9, 20),
+		("abc👍👩‍👩‍👧‍👦", 5, 16),
+	]
+
+	// Build every slow string up front, then check each one in order.
+	let slowStrings = cases.map { NSSlowString(string: $0.text) }
+	for (slow, expected) in zip(slowStrings, cases) {
+		check(
+			slow as String,
+			expectedCount: expected.characters,
+			expectedCodeUnitCount: expected.codeUnits)
+	}
+}
+
+tests.test("Zalgo") {
+	// Check that we handle absurdly long graphemes: "c" followed by every
+	// scalar in 0x300...0x36f is expected to count as a single character.
+	var zalgo = "a👩‍👩‍👧‍👦c"
+	for combo in 0x300...0x36f {
+		zalgo.append(String(UnicodeScalar(combo)!))
+	}
+	// 1 ("a") + 11 (emoji sequence) + 1 ("c") + 112 appended scalars = 125
+	// UTF-16 code units.
+	check(
+		NSSlowString(string: zalgo) as String,
+		expectedCount: 3,
+		expectedCodeUnitCount: 125
+	)
+
+	// Check for interspersed zalgo and emoji. The expected values come from
+	// the native string the slow string is built from.
+	let megaZalgo = zalgo + zalgo + zalgo + zalgo
+	check(
+		NSSlowString(string: megaZalgo) as String,
+		expectedCount: megaZalgo.count,
+		expectedCodeUnitCount: megaZalgo.utf16.count
+	)
+}
+
+runAllTests()
+

diff --git a/test/stdlib/NewString.swift b/test/stdlib/NewString.swift
index ec4c586..da620a5 100644
--- a/test/stdlib/NewString.swift
+++ b/test/stdlib/NewString.swift

@@ -129,6 +129,8 @@
   var nsASCII = NSString(utf8String: "foobar")!
   // CHECK-NEXT: has UTF-16: false
   print("has UTF-16: \(CFStringGetCharactersPtr(unsafeBitCast(nsASCII, to: CFString.self)) != nil)")
+  print("has ASCII pointer: \(CFStringGetCStringPtr(unsafeBitCast(nsASCII, to: CFString.self), 0x0600) != nil)")
+  print("has ASCII pointer: \(CFStringGetCStringPtr(unsafeBitCast(nsASCII, to: CFString.self), 0x08000100) != nil)")
 
   // CHECK: --- ASCII basic round-tripping ---
   print("--- ASCII basic round-tripping ---")
commit	091738cf9758bed3bdecaac50a8af45b76546a3d	[log] [tgz]
author	Michael Ilseman <michael.ilseman@gmail.com>	Fri Jun 30 14:13:39 2017 -0700
committer	GitHub <noreply@github.com>	Fri Jun 30 14:13:39 2017 -0700
tree	ab1ac27e7c23223b44ce73c492086b84ff6c2103
parent	2eebb1a8e1bc898ec02d8e695c9f58607c6f0ac7 [diff]
parent	b6ac8c61abe05b0bba9a6fc793ef07fd282bd7cb [diff]