Merge pull request #22614 from linux-on-ibm-z/s390x-smallstring-5.0-backport
[5.0] [string] Fix string implementation for big endian platforms
diff --git a/stdlib/public/core/SmallString.swift b/stdlib/public/core/SmallString.swift
index af51dea..9d4570c 100644
--- a/stdlib/public/core/SmallString.swift
+++ b/stdlib/public/core/SmallString.swift
@@ -10,6 +10,19 @@
//
//===----------------------------------------------------------------------===//
+// The code units in _SmallString are always stored in memory in the same order
+// that they would be stored in an array. This means that on big-endian
+// platforms the order of the bytes in storage is reversed compared to
+// _StringObject whereas on little-endian platforms the order is the same.
+//
+// Memory layout:
+//
+// |0 1 2 3 4 5 6 7 8 9 A B C D E F| ← hexadecimal offset in bytes
+// | _storage.0 | _storage.1 | ← raw bits
+// | code units | | ← encoded layout
+// ↑ ↑
+// first (leftmost) code unit discriminator (incl. count)
+//
@_fixed_layout @usableFromInline
internal struct _SmallString {
@usableFromInline
@@ -50,16 +63,18 @@
@inlinable @inline(__always)
internal init(_ object: _StringObject) {
_internalInvariant(object.isSmall)
- self.init(raw: object.rawBits)
+ // On big-endian platforms the byte order is the reverse of _StringObject.
+ let leading = object.rawBits.0.littleEndian
+ let trailing = object.rawBits.1.littleEndian
+ self.init(raw: (leading, trailing))
}
@inlinable @inline(__always)
internal init() {
- self.init(raw: _StringObject(empty:()).rawBits)
+ self.init(_StringObject(empty:()))
}
}
-// TODO
extension _SmallString {
@inlinable
internal static var capacity: Int {
@@ -72,9 +87,12 @@
}
}
+ // Get an integer equivalent to the _StringObject.discriminatedObjectRawBits
+ // computed property.
@inlinable @inline(__always)
internal var rawDiscriminatedObject: UInt64 {
- return _storage.1
+ // Reverse the bytes on big-endian systems.
+ return _storage.1.littleEndian
}
@inlinable
@@ -107,7 +125,7 @@
// usage: it always clears the discriminator and count (in case it's full)
@inlinable @inline(__always)
internal var zeroTerminatedRawCodeUnits: RawBitPattern {
- let smallStringCodeUnitMask: UInt64 = 0x00FF_FFFF_FFFF_FFFF
+ let smallStringCodeUnitMask = ~UInt64(0xFF).bigEndian // zero last byte
return (self._storage.0, self._storage.1 & smallStringCodeUnitMask)
}
@@ -231,11 +249,12 @@
_internalInvariant(count <= _SmallString.capacity)
let isASCII = (leading | trailing) & 0x8080_8080_8080_8080 == 0
- let countAndDiscriminator = UInt64(truncatingIfNeeded: count) &<< 56
- | _StringObject.Nibbles.small(isASCII: isASCII)
- _internalInvariant(trailing & countAndDiscriminator == 0)
+ let discriminator = _StringObject.Nibbles
+ .small(withCount: count, isASCII: isASCII)
+ .littleEndian // reversed byte order on big-endian platforms
+ _internalInvariant(trailing & discriminator == 0)
- self.init(raw: (leading, trailing | countAndDiscriminator))
+ self.init(raw: (leading, trailing | discriminator))
_internalInvariant(self.count == count)
}
@@ -300,23 +319,31 @@
#endif
extension UInt64 {
- // Fetches the `i`th byte, from least-significant to most-significant
- //
- // TODO: endianess awareness day
+ // Fetches the `i`th byte in memory order. On little-endian systems the byte
+ // at i=0 is the least significant byte (LSB) while on big-endian systems the
+ // byte at i=7 is the LSB.
@inlinable @inline(__always)
internal func _uncheckedGetByte(at i: Int) -> UInt8 {
_internalInvariant(i >= 0 && i < MemoryLayout<UInt64>.stride)
+#if _endian(big)
+ let shift = (7 - UInt64(truncatingIfNeeded: i)) &* 8
+#else
let shift = UInt64(truncatingIfNeeded: i) &* 8
+#endif
return UInt8(truncatingIfNeeded: (self &>> shift))
}
- // Sets the `i`th byte, from least-significant to most-significant
- //
- // TODO: endianess awareness day
+ // Sets the `i`th byte in memory order. On little-endian systems the byte
+ // at i=0 is the least significant byte (LSB) while on big-endian systems the
+ // byte at i=7 is the LSB.
@inlinable @inline(__always)
internal mutating func _uncheckedSetByte(at i: Int, to value: UInt8) {
_internalInvariant(i >= 0 && i < MemoryLayout<UInt64>.stride)
+#if _endian(big)
+ let shift = (7 - UInt64(truncatingIfNeeded: i)) &* 8
+#else
let shift = UInt64(truncatingIfNeeded: i) &* 8
+#endif
let valueMask: UInt64 = 0xFF &<< shift
self = (self & ~valueMask) | (UInt64(truncatingIfNeeded: value) &<< shift)
}
@@ -336,5 +363,6 @@
r = r | (UInt64(input[idx]) &<< shift)
shift = shift &+ 8
}
- return r
+ // Convert from little-endian to host byte order.
+ return r.littleEndian
}
diff --git a/stdlib/public/core/StringObject.swift b/stdlib/public/core/StringObject.swift
index 5bc877f..3c785f4 100644
--- a/stdlib/public/core/StringObject.swift
+++ b/stdlib/public/core/StringObject.swift
@@ -516,11 +516,15 @@
*/
extension _StringObject {
-#if arch(i386) || arch(arm)
@inlinable @inline(__always)
internal init(_ small: _SmallString) {
+ // Small strings are encoded as _StringObjects in reverse byte order
+ // on big-endian platforms. This is to match the discriminator to the
+ // spare bits (the most significant nibble) in a pointer.
+ let word1 = small.rawBits.0.littleEndian
+ let word2 = small.rawBits.1.littleEndian
+#if arch(i386) || arch(arm)
// On 32-bit, we need to unpack the small string.
- let (word1, word2) = small.rawBits
let smallStringDiscriminatorAndCount: UInt64 = 0xFF00_0000_0000_0000
let leadingFour = Int(truncatingIfNeeded: word1)
@@ -532,15 +536,12 @@
variant: .immortal(nextFour),
discriminator: smallDiscriminatorAndCount,
flags: trailingTwo)
- _internalInvariant(isSmall)
- }
#else
- @inlinable @inline(__always)
- internal init(_ small: _SmallString) {
- self.init(rawValue: small.rawBits)
+ // On 64-bit, we copy the raw bits (to host byte order).
+ self.init(rawValue: (word1, word2))
+#endif
_internalInvariant(isSmall)
}
-#endif
@inlinable
internal static func getSmallCount(fromRaw x: UInt64) -> Int {