Parallel Merge Sort in Scala - multithreading

I have been trying to implement parallel merge sort in Scala. But with 8 cores, using .sorted is still about twice as fast.
edit:
I rewrote most of the code to minimize object creation. Now it runs about as fast as the .sorted
Input file with 1.2M integers:
1.333580 seconds (my implementation)
1.439293 seconds (.sorted)
How should I parallelize this?
New implementation
object Mergesort extends App
{
//=====================================================================================================================
// UTILITY
/** Ad-hoc ordering over untyped values: two Ints compare numerically, two Strings
  * lexicographically, and every other pairing is reported as equal (0). */
implicit object comp extends Ordering[Any] {
  def compare(a: Any, b: Any) = a match {
    case x: Int => b match {
      case y: Int => x compare y
      case _ => 0
    }
    case s: String => b match {
      case t: String => s compare t
      case _ => 0
    }
    case _ => 0
  }
}
//=====================================================================================================================
// MERGESORT
val THRESHOLD = 30
/** In-place insertion sort of the slice a(left) .. a(right - 1), ordered by `comp`.
  * Returns the same array instance that was passed in. */
def inssort[A](a: Array[A], left: Int, right: Int): Array[A] = {
  var next = left + 1
  while (next < right) {
    val pending = a(next)
    var hole = next
    // Shift every larger element one slot to the right until `pending` fits.
    while (hole > left && comp.lt(pending, a(hole - 1))) {
      a(hole) = a(hole - 1)
      hole -= 1
    }
    a(hole) = pending
    next += 1
  }
  a
}
/** Merges the sorted runs a(left until mid) and a(mid until right) back into `a`,
  * using `temp` as scratch space over the same index range. Returns `a`.
  *
  * The left run is staged in ascending order and the right run in reverse order
  * directly behind it, so the two read cursors walk towards each other and neither
  * needs its own bounds check. When the two candidates compare equal, the one under
  * the upper cursor is taken. */
def mergesort_merge[A](a: Array[A], temp: Array[A], left: Int, right: Int, mid: Int) : Array[A] = {
  // Stage the left run unchanged.
  var write = left
  while (write < mid) {
    temp(write) = a(write)
    write += 1
  }
  // Stage the right run back to front.
  var read = right
  while (read > mid) {
    read -= 1
    temp(write) = a(read)
    write += 1
  }
  // Consume from both ends of the staged range.
  var lo = left
  var hi = right - 1
  var out = left
  while (out < right) {
    if (comp.lt(temp(lo), temp(hi))) {
      a(out) = temp(lo)
      lo += 1
    } else {
      a(out) = temp(hi)
      hi -= 1
    }
    out += 1
  }
  a
}
/** Recursively sorts a(left until right) in place and returns `a`.
  *
  * Slices longer than THRESHOLD are halved, sorted recursively and merged through
  * `temp`; shorter slices are handed to insertion sort. Slices of at most one
  * element are returned untouched.
  *
  * @param a     array being sorted in place
  * @param temp  scratch array, at least as long as `a`
  * @param left  first index of the slice (inclusive)
  * @param right end index of the slice (exclusive)
  */
def mergesort_split[A](a: Array[A], temp: Array[A], left: Int, right: Int): Array[A] = {
  val span = right - left
  if (span <= 1) a
  else if (span > THRESHOLD) {
    // left + span / 2 cannot overflow the way (left + right) / 2 can for large indices.
    val mid = left + span / 2
    mergesort_split(a, temp, left, mid)
    mergesort_split(a, temp, mid, right)
    mergesort_merge(a, temp, left, right, mid)
  }
  else inssort(a, left, right)
}
/** Sorts `a` in place and returns it. A scratch array of the same length is
  * allocated once here and shared by every merge step. */
def mergesort[A: ClassTag](a: Array[A]): Array[A] = {
  val scratch = new Array[A](a.length)
  mergesort_split(a, scratch, 0, a.length)
}
Previous implementation
Input file with 1.2M integers:
4.269937 seconds (my implementation)
1.831767 seconds (.sorted)
What sort of tricks there are to make it faster and cleaner?
/** List-based merge sort (sequential and Future-based) together with a small
  * benchmark that sorts the contents of `in.txt` and writes the result to `out.txt`.
  *
  * The benchmark lives in an explicit `main` instead of the `App` body, so using the
  * sorting helpers on their own never triggers the file I/O.
  */
object Mergesort {
  //=====================================================================================================================
  // UTILITY
  val StartNano = System.nanoTime

  /** Prints `msg` prefixed with the number of milliseconds elapsed since `StartNano`. */
  def dbg(msg: String) = println("%05d DBG ".format(((System.nanoTime - StartNano)/1e6).toInt) + msg)

  /** Evaluates `work`, prints how many seconds it took, and returns its result. */
  def time[T](work: =>T) = {
    val start = System.nanoTime
    val res = work
    println("%f seconds".format((System.nanoTime - start)/1e9))
    res
  }

  /** Ordering used by every sort below: Int pairs compare numerically, String pairs
    * lexicographically, and any other pairing is treated as equal. */
  implicit object comp extends Ordering[Any] {
    def compare(a: Any, b: Any) = {
      (a, b) match {
        case (a: Int, b: Int) => a compare b
        case (a: String, b: String) => a compare b
        case _ => 0
      }
    }
  }

  //=====================================================================================================================
  // MERGESORT

  /** Lazily merges two lists that are already sorted according to `comp`.
    * When the heads compare equal, the element from `left` is emitted first. */
  def merge[A](left: List[A], right: List[A]): Stream[A] = (left, right) match {
    case (x :: xs, y :: ys) if comp.lteq(x, y) => x #:: merge(xs, right)
    case (x :: xs, y :: ys) => y #:: merge(left, ys)
    case _ => if (left.isEmpty) right.toStream else left.toStream
  }

  /** Sorts `input`, whose number of elements must be passed as `length`.
    * Inputs shorter than 100 elements go straight to `sortWith`; longer ones are
    * halved, sorted recursively and merged. */
  def sort[A](input: List[A], length: Int): List[A] =
    if (length < 100) input.sortWith(comp.lt)
    else input match {
      case Nil | List(_) => input
      case _ =>
        val middle = length / 2
        val (left, right) = input.splitAt(middle)
        // The right half holds the remaining length - middle elements.
        merge(sort(left, middle), sort(right, length - middle)).toList
    }

  /** Sequential merge sort entry point. */
  def msort[A](input: List[A]): List[A] = sort(input, input.length)

  //=====================================================================================================================
  // PARALLELIZATION
  //val cores = Runtime.getRuntime.availableProcessors
  //dbg("Detected %d cores.".format(cores))
  //lazy implicit val ec = ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(cores))

  // With the fixed-size pool above commented out, nothing in this object supplies an
  // ExecutionContext for `Future(...)`. The global context is used and passed
  // explicitly, so the code does not depend on which implicits the file imports.
  private val defaultEc: ExecutionContext = ExecutionContext.global

  /** Merges the results of two futures once both have completed. */
  def futuremerge[A](fa: Future[List[A]], fb: Future[List[A]])(implicit order: Ordering[A], ec: ExecutionContext) =
    fa.flatMap(a => fb.map(b => merge(a, b).toList)(ec))(ec)

  /** Merge sort that sorts the two halves of any input longer than 500 elements in
    * separate futures; shorter inputs are sorted sequentially inside one future.
    *
    * @param input  list to sort
    * @param length number of elements in `input`
    */
  def parallel_msort[A](input: List[A], length: Int)(implicit order: Ordering[A]): Future[List[A]] = {
    if (length > 500) {
      // Split only when the halves are actually needed.
      val middle = length / 2
      val (left, right) = input.splitAt(middle)
      val fl = parallel_msort(left, middle)
      val fr = parallel_msort(right, length - middle)
      futuremerge(fl, fr)(order, defaultEc)
    }
    else Future(msort(input))(defaultEc)
  }

  //=====================================================================================================================
  // MAIN

  /** Times the parallel sort of `in.txt`, times `.sorted` on the same file for
    * comparison, and writes the parallel result to `out.txt`.
    * The first line of the input is a header; a first token of "i" means the
    * remaining lines are integers, otherwise they are sorted as strings. */
  def main(args: Array[String]): Unit = {
    val results = time {
      val source = Source.fromFile("in.txt")
      try {
        val src = source.getLines()
        val header = src.next().split(" ").toVector
        val lines: List[Any] = if (header(0) == "i") src.map(_.toInt).toList else src.toList
        val f = parallel_msort(lines, lines.length)
        Await.result(f, scala.concurrent.duration.Duration.Inf)
      } finally source.close()
    }
    println("Sorted as comparison...")
    // Reads the same file as above; the undefined `input_folder` prefix is gone.
    val sortedSource = Source.fromFile("in.txt")
    try {
      val sorted_src = sortedSource.getLines()
      sorted_src.next()
      time(sorted_src.toList.sorted)
    } finally sortedSource.close()
    val writer = new PrintWriter("out.txt", "UTF-8")
    try writer.print(results.mkString("\n"))
    finally writer.close()
  }
}

My answer is probably going to be a bit long, but I hope that it will be useful for both you and me.
So, first question is: "how scala is doing sorting for a List?" Let's have a look at the code from scala repo!
// Returns a sorted copy of this collection, ordered by `ord`.
// The elements are copied into an AnyRef array, sorted by java.util.Arrays.sort,
// and then fed back through this collection's builder.
def sorted[B >: A](implicit ord: Ordering[B]): Repr = {
val len = this.length
val b = newBuilder
// A single element is already sorted: copy it straight into the builder.
if (len == 1) b ++= this
else if (len > 1) {
b.sizeHint(len)
val arr = new Array[AnyRef](len) // Previously used ArraySeq for more compact but slower code
var i = 0
// Copy every element into the array, cast to AnyRef.
for (x <- this) {
arr(i) = x.asInstanceOf[AnyRef]
i += 1
}
// The actual sorting is delegated to the JDK, with the Ordering passed as the comparator.
java.util.Arrays.sort(arr, ord.asInstanceOf[Ordering[Object]])
i = 0
// Append the sorted elements to the builder, cast back to the element type.
while (i < arr.length) {
b += arr(i).asInstanceOf[A]
i += 1
}
}
// For an empty collection nothing was added, so this is an empty result.
b.result()
}
So what the hell is going on here? Long story short: with java. Everything else is just size justification and casting. Basically this is the line which defines it:
java.util.Arrays.sort(arr, ord.asInstanceOf[Ordering[Object]])
Let's go one level deeper into JDK sources:
/**
 * Sorts the array with the given comparator.
 * A null comparator delegates to sort(a). Otherwise legacyMergeSort is used when
 * LegacyMergeSort.userRequested is set, and TimSort.sort in every other case.
 */
public static <T> void sort(T[] a, Comparator<? super T> c) {
if (c == null) {
sort(a);
} else {
if (LegacyMergeSort.userRequested)
legacyMergeSort(a, c);
else
TimSort.sort(a, 0, a.length, c, null, 0, 0);
}
}
legacyMergeSort is nothing but single threaded implementation of merge sort algorithm.
The next question is: "what is TimSort.sort and when do we use it?"
To the best of my knowledge, the default value of the LegacyMergeSort.userRequested property is false, which leads us to the TimSort.sort algorithm. A description can be found here. Why is it better? According to the comments in the JDK sources, it needs fewer comparisons than merge sort.
Moreover you should be aware that it is all single threaded, so no parallelization here.
Third question, "your code":
You create too many objects. When it comes to performance, mutation (sadly) is your friend.
Premature optimization is the root of all evil -- Donald Knuth. Before making any optimizations (like parallelism), try to implement single threaded version and compare the results.
Use something like JMH to test performance of your code.
You should probably not use the Stream class if you want the best performance, as it does additional caching of the elements it has already computed.
I intentionally did not give you answer like "super-fast merge sort in scala can be found here", but just some tips for you to apply to your code and coding practices.
Hope it will help you.

Related

Scala string interpolation with a pass-by-name string

I'd like to pass a default string to a function and have "string interpolation" done on it in the function rather than at the call site.
For example,
// Illustration of the problem; this does not compile.
// The default value of `msg` interpolates a, b and v, but none of them is in scope
// at the point where the default argument is evaluated.
def isBetween(a:Int, b:Int,
msg: String = s"${v} is not between ${a} and ${b}."
)(v:Int):Either[String, Boolean] = {
// Right(true) when v lies in the closed interval [a, b], otherwise Left(msg).
if (a <= v && v <= b) Right(true) else Left(msg)
}
This doesn't compile because none of a, b, and for sure not v are in scope when the compiler wants to do the interpolation.
The goal is to provide a default error string but allow the user to change it, if necessary. For example:
val normalBetween = isBetween(0, 100)
val customBetween = isBetween(0, 100, s"Doofus! it's gotta be ${a} <= v <= ${b} but v is ${v}!")
val result1 = normalBetween(101) // Left("101 is not between 0 and 100.")
val result2 = customBetween(101) // Left("Doofus! it's gotta be 0 <= v <= 100 but v is 101!")
I tried making msg pass-by-name; no luck.
I suppose I want something like this from the Python world:
name = 'world'
program ='python'
print('Hello {name}!This is{program}.'.format(name=name, program=program))
Any suggestions?
As @LuisMiguelMejíaSuárez suggested in the comments, you can just use Java's string formatting:
/** Checks whether `v` lies in the closed interval [a, b].
  *
  * `msg` is a two-stage format template: it is first formatted with `a` and `b`
  * (so the placeholder for `v` must be escaped as `%%d`), and the result is then
  * formatted with `v`. The message is only built when the check fails.
  */
def isBetween(a: Int, b: Int, msg: String = "%%d is not between %d and %d.")(v: Int): Either[String, Boolean] =
  Either.cond(a <= v && v <= b, true, msg.format(a, b).format(v))
def normalBetween: Int => Either[String, Boolean] = isBetween(0, 100)
def customBetween: Int => Either[String, Boolean] = isBetween(0, 100, "Doofus! it's gotta be %d <= v <= %d but v is %%d!")
val result1 = normalBetween(101) // Left("101 is not between 0 and 100.")
val result2 = customBetween(101) // Left("Doofus! it's gotta be 0 <= v <= 100 but v is 101!")
println(result1)
println(result2)
The result will be as expected. Code run at Scastie. If you take this approach and your real scenario is more complex than the given example, you can use named parameters in this string. You can read more about that in "Named placeholders in string formatting", "How to format message with argument names instead of numbers?", and many other articles.
It's not possible to refer to a variable declared in the same (or a future) parameter list, however you can refer to a variable declared in a previous parameter list, like so:
/** Checks whether `v` lies in the closed interval [a, b].
  * `msg` sits in the last parameter list so that its default can refer to `a`, `b`
  * and `v` from the preceding lists. */
def isBetween(a: Int, b: Int)(v: Int)(
  msg: String = s"${v} is not between ${a} and ${b}."
): Either[String, Boolean] =
  if (v < a || v > b) Left(msg) else Right(true)
If you'd like to be able to offer callers the ability to provide a custom template string, you can do so as follows:
/** Checks whether `v` lies in the closed interval [a, b].
  *
  * @param msg builds the error text from (a, b, v); it is only invoked when the
  *            check fails
  * @return Right(true) when a <= v <= b, otherwise Left with the built message
  */
def isBetween(
  a: Int, b: Int
)(v: Int)(
  msg: (Int, Int, Int) => String =
    (pA, pB, pV) => s"${pV} is not between ${pA} and ${pB}."
): Either[String, Boolean] = {
  // The closing parenthesis of Left(...) was missing.
  if (a <= v && v <= b) Right(true) else Left(msg(a, b, v))
}
Example usage:
val customMsg = (a: Int, b: Int, v: Int) => s"Sorry but $v is not between $a and $b!"
isBetween(5, 7)(6)(customMsg)
If you'd like to offer callers a completely "custom" isBetween, then you can do so by putting the message in the first parameter group:
/** Variant with the message builder in the first parameter list, so that a
  * customised checker can be created by supplying only the builder.
  * The builder receives (a, b, v) and is only invoked when the check fails. */
def isBetween(
  msg: (Int, Int, Int) => String =
    (pA, pB, pV) => s"${pV} is not between ${pA} and ${pB}."
)(a: Int, b: Int)(v: Int): Either[String, Boolean] =
  Either.cond(a <= v && v <= b, true, msg(a, b, v))
val customMsg = (a: Int, b: Int, v: Int) => s"Sorry but $v is not between $a and $b!"
val customMsgIsBetween = isBetween(customMsg) _
customMsgIsBetween(5, 7)(6)
It's worth remembering that we can use sentinel values for this. While null is discouraged in Scala for passing data around, it is still allowed, and for a temporary local use, it's fairly harmless as long as we don't let it escape scope.
/** Checks whether `v` lies in the closed interval [a, b].
  * `msgArg` uses null as the "not supplied" sentinel; in that case the default
  * message is built from `a`, `b` and `v`. The null never leaves this method. */
def isBetween(a: Int, b: Int, msgArg: String = null)(v: Int): Either[String, Boolean] = {
  val msg = Option(msgArg).getOrElse(s"${v} is not between ${a} and ${b}.")
  if (v < a || v > b) Left(msg) else Right(true)
}

How to handle optional db step in slick 3?

I'm sure I'm simply facing a mental block with the functional model of Slick 3, but I cannot discern how to transactionally sequence an optional dependent db step in Slick 3. Specifically, I have a table with an optional (nullable) foreign key and I want it to be set to the ID of the inserted dependent record (if any, else null). That is, roughly:
if ( x is non null )
start transaction
id = insert x
insert y(x = id)
commit
else
start transaction
insert y(x = null)
commit
Of course, I'd rather not have the big if around the choice. Dependencies without the Option[] seem (relatively) straightforward, but the option is throwing me.
Precise example code (sans imports) follows. In this example, the question is how to save both x (a) and y (b) in the same transaction both if y is None or not. Saving Y itself seems straightforward enough as every related C has a non-optional B reference, but addressing the optional reference in A is unclear (to me).
/** Self-contained example: three tables (As, Bs, Cs), an object model (X, Y, Z)
  * and the mappers between them.
  *
  * The save loop below only persists the optional Y of each example, as a B row
  * plus its C rows; it never inserts into `as`.
  */
object test {
  implicit val db = Database.forURL("jdbc:h2:mem:DataTableTypesTest;DB_CLOSE_DELAY=-1", driver = "org.h2.Driver")

  /* Data model */
  // A carries an optional (nullable) reference to a B row.
  case class A(id: Long, b: Option[Long], s: String)
  class As(tag: Tag) extends Table[A](tag, "As") {
    def id = column[Long]("ID", O.PrimaryKey, O.AutoInc)
    def b = column[Option[Long]]("B")
    def s = column[String]("S")
    def * = (id, b, s) <> (A.tupled, A.unapply)
  }
  val as = TableQuery[As]

  case class B(id: Long, s: String)
  class Bs(tag: Tag) extends Table[B](tag, "Bs") {
    def id = column[Long]("ID", O.PrimaryKey, O.AutoInc)
    def s = column[String]("S")
    def * = (id, s) <> (B.tupled, B.unapply)
  }
  val bs = TableQuery[Bs]

  // C carries a mandatory reference to a B row.
  case class C(id: Long, b: Long, s: String)
  class Cs(tag: Tag) extends Table[C](tag, "Cs") {
    def id = column[Long]("ID", O.PrimaryKey, O.AutoInc)
    def b = column[Long]("B")
    def s = column[String]("S")
    def * = (id, b, s) <> (C.tupled, C.unapply)
  }
  val cs = TableQuery[Cs]

  /* Object model */
  case class X(id: Long, s: String, y: Option[Y])
  case class Y(id: Long, s: String, z: Set[Z])
  case class Z(id: Long, s: String)

  /* Mappers */
  def xToA(x: X, bId: Option[Long]): A = { A(x.id, bId, x.s) }
  def yToB(y: Y): B = { B(y.id, y.s) }
  def zToC(z: Z, bId: Long): C = { C(z.id, bId, z.s) }

  /* Given */
  val example1 = X(0, "X1", Some(Y(0, "Y1", Set(Z(0, "Z11"), Z(0, "Z12")))))
  val example2 = X(0, "X2", Some(Y(0, "Y2", Set())))
  val example3 = X(0, "X3", None)

  Await.result(db.run((as.schema ++ bs.schema ++ cs.schema).create), 10.seconds)

  val examples = Seq(example1, example2, example3)
  for ( example <- examples ) {
    // For an example with a Y: insert the B row, then its C rows pointing at the
    // generated B id, all in one transaction. Examples without a Y yield None.
    val saveY = (for { y <- example.y }
      yield ( for {
        id <- (bs returning bs.map(_.id)) += yToB(y)
        _ <- cs ++= y.z.map(zToC(_, id))
      } yield id).transactionally)
    // Run the action only when there is one, without isDefined / get.
    saveY.foreach(action => Await.result(db.run(action), 10.seconds))
  }

  // Dump the contents of all three tables.
  println(Await.result(
    db.run(
      (for { a <- as } yield a).result
    ),
    10.seconds
  ))
  println(Await.result(
    db.run(
      (for { b <- bs } yield b).result
    ),
    10.seconds
  ))
  println(Await.result(
    db.run(
      (for { c <- cs } yield c).result
    ),
    10.seconds
  ))
}
This is fairly straightforward; just use the monadic-ness of DBIO:
// Input B value; this is your `x` in the question.
// `???` marks a value the caller has to supply (`val x = _` is not valid Scala).
val x: Option[B] = ???
// Assume `y` is fully-initialized with a `None` `b` value.
val y: A = ???
// DBIO wrapping the newly-inserted ID, if `x` is set.
// The ID columns are declared as Long, so the generated key is a Long, not an Int.
val maybeInsertX: DBIO[Option[Long]] = x match {
  case Some(xToInsert) =>
    // Insert and return the new ID.
    val newId: DBIO[Long] = bs.returning(bs.map(_.id)) += xToInsert
    // Map to the expected Option.
    newId.map(Some(_))
  case None =>
    // No x means no ID.
    DBIO.successful(None)
}
// Now perform your insert, copying in the newly-generated ID.
// The Int result here is the value returned by `+=`, not an ID.
val insertA: DBIO[Int] = maybeInsertX.flatMap(bIdOption =>
  as += y.copy(b = bIdOption)
)
// Run transactionally.
db.run(insertA.transactionally)

Find all indices of a search term in a string

I need a fast method to find all indices of a search term that might occur in a string. I tried this 'brute force' String extension method:
// Note: makes use of ExSwift
// Brute-force search: compares the slice starting at every position of the string
// with the search term and collects the positions where they are equal.
extension String
{
// Number of characters in the string, recomputed with count(self) on every access.
var length: Int { return count(self) }
// Returns the 0-based start positions of every occurrence of searchTerm.
func indicesOf(searchTerm:String) -> [Int] {
var indices = [Int]()
for i in 0 ..< self.length {
// NOTE(review): for positions near the end of the string this range runs past the
// last character; presumably the ExSwift range subscript tolerates that - confirm.
let segment = self[i ... (i + searchTerm.length - 1)]
if (segment == searchTerm) {
indices.append(i)
}
}
return indices;
}
}
... But it's ridiculously slow, especially the shorter the search term is. What would be a better method to find all indices fast?
As Martin said you can implement some of the well known fastest algorithms in String Matching, The Knuth–Morris–Pratt string searching algorithm (or KMP algorithm) searches for occurrences of a "word" W within a main "text string" S.
The algorithm has complexity O(n), where n is the length of S and the O is big-O notation.
// Knuth-Morris-Pratt search implemented directly on String, using the Int
// subscript defined at the bottom of this extension.
extension String {
// Builds the prefix table for str. The table has count(str) + 1 entries and
// entry 0 is the sentinel -1.
private func build_pi(str: String) -> [Int] {
var n = count(str)
var pi = Array(count: n + 1, repeatedValue: 0)
var k = -1
pi[0] = -1
for (var i = 0; i < n; ++i) {
// Fall back through the table until str[k] matches str[i] or k reaches -1.
while (k >= 0 && str[k] != str[i]) {
k = pi[k]
}
pi[i + 1] = ++k
}
return pi
}
// Returns the 0-based start index of every occurrence of pattern in self.
func searchPattern(pattern: String) -> [Int] {
var matches = [Int]()
var n = count(self)
var m = count(pattern)
// k is the number of pattern characters matched so far.
var k = 0
var pi = build_pi(pattern)
for var i = 0; i < n; ++i {
// After a full match (k == m) or on a mismatch, fall back through the table.
while (k >= 0 && (k == m || pattern[k] != self[i])) {
k = pi[k]
}
if ++k == m {
// The match ends at i, so it starts at i - m + 1.
matches.append(i - m + 1)
}
}
return matches
}
// Character at offset i, found by advancing from startIndex on every call.
subscript (i: Int) -> Character {
return self[advance(self.startIndex, i)]
}
}
Then you can use it in the following way:
var string = "apurba mandal loves ayoshi loves"
var pattern = "loves"
println(string.searchPattern(pattern))
And the output should be:
[14, 27]
These are the start indices of the pattern's occurrences inside the string. I hope this helps you.
EDIT:
As Martin said in his comment, you need to avoid using the advance function to index a String by an Int, because each access costs O(position of the index).
One possible solution is to convert the String to an array of Character and then access to the indexes is O(1).
Then the extension can be changed to this one :
// Knuth-Morris-Pratt search that works on Character arrays instead of indexing
// the strings directly.
extension String {
// Builds the prefix table for str. The table has count(str) + 1 entries and
// entry 0 is the sentinel -1.
private func build_pi(str: [Character]) -> [Int] {
var n = count(str)
var pi = Array(count: n + 1, repeatedValue: 0)
var k = -1
pi[0] = -1
for (var i = 0; i < n; ++i) {
// Fall back through the table until str[k] matches str[i] or k reaches -1.
while (k >= 0 && str[k] != str[i]) {
k = pi[k]
}
pi[i + 1] = ++k
}
return pi
}
// Returns the 0-based start index of every occurrence of pattern in self.
func searchPattern(pattern: String) -> [Int] {
// Convert to Character array to index in O(1)
var patt = Array(pattern)
var S = Array(self)
var matches = [Int]()
var n = count(self)
var m = count(pattern)
// k is the number of pattern characters matched so far.
var k = 0
var pi = build_pi(patt)
for var i = 0; i < n; ++i {
// After a full match (k == m) or on a mismatch, fall back through the table.
while (k >= 0 && (k == m || patt[k] != S[i])) {
k = pi[k]
}
if ++k == m {
// The match ends at i, so it starts at i - m + 1.
matches.append(i - m + 1)
}
}
return matches
}
}
Instead of checking for the search term at each position of the string
you could use rangeOfString() to find the next occurrence (hoping
that rangeOfString() uses more advanced algorithms):
// Finds every occurrence of a search term by calling rangeOfString repeatedly.
extension String {
// Returns the offsets (distance from startIndex) of all occurrences of searchTerm.
func indicesOf(searchTerm:String) -> [Int] {
var indices = [Int]()
var pos = self.startIndex
// Search only the part of the string from pos to the end.
while let range = self.rangeOfString(searchTerm, range: pos ..< self.endIndex) {
indices.append(distance(self.startIndex, range.startIndex))
// Resume one position after the start of the match (not after its end), so
// occurrences that overlap the current one are found as well.
pos = range.startIndex.successor()
}
return indices
}
}
Generally, it depends on the size of the input string and the size
of the search string which algorithm is "the fastest". You'll find
an overview with links to various algorithms in
String searching algorithm.
Update for Swift 3:
extension String {
    /// Returns the character offsets of every occurrence of `searchTerm`.
    /// After each hit the search resumes one character past the start of the hit,
    /// so overlapping occurrences are reported too.
    func indices(of searchTerm:String) -> [Int] {
        var found: [Int] = []
        var cursor = startIndex
        while true {
            guard let hit = self.range(of: searchTerm, range: cursor ..< endIndex) else {
                break
            }
            found.append(distance(from: startIndex, to: hit.lowerBound))
            cursor = index(after: hit.lowerBound)
        }
        return found
    }
}
Using NSRegularExpression in Swift 4, you can do it like this. NSRegularExpression has been around forever and is probably a better choice than rolling your own algorithm for most cases.
let text = "The quieter you become, the more you can hear."
let searchTerm = "you"
// Escape the term so that characters such as "." or "(" are matched literally
// instead of being interpreted as regular-expression syntax.
let regex = try! NSRegularExpression(pattern: NSRegularExpression.escapedPattern(for: searchTerm), options: [])
let range: NSRange = NSRange(text.startIndex ..< text.endIndex, in: text)
let matches: [NSTextCheckingResult] = regex.matches(in: text, options: [], range: range)
let ranges: [NSRange] = matches.map { $0.range }
// NSRange locations are UTF-16 offsets; they equal character offsets only while
// every character before the match is a single UTF-16 unit.
let indices: [Int] = ranges.map { $0.location }
let swiftRanges = ranges.map { Range($0, in: text) }
// compactMap drops the nil results; flatMap is deprecated for this purpose.
let swiftIndices: [String.Index] = swiftRanges.compactMap { $0?.lowerBound }

How doing String-Programming in Swift

I miss usable String functions that are easy to use without typing lines of strange identifiers. So I decided to build up a library of useful and recognizable String functions.
I first tried to use Cocoa String-Functions to solve this problem. So I tried in the playground:
import Cocoa
// First attempt at returning `length` characters of `s` from the 1-based position `start`.
// Known flaw: the end index is computed from `s`, but it is applied to the new
// string returned by substringFromIndex. The result is wrong (and can crash) when
// characters before `start` take up more than one code unit.
func PartOfString(s: String, start: Int, length: Int) -> String
{
return s.substringFromIndex(advance(s.startIndex, start - 1)).substringToIndex(advance(s.startIndex, length))
}
PartOfString("HelloaouAOUs.World", 1, 5) --> "Hello"
PartOfString("HelloäöüÄÖÜß…World", 1, 5) --> "Hello"
PartOfString("HelloaouAOUs.World", 1, 18) --> "HelloaouAOUs.World"
PartOfString("HelloäöüÄÖÜß…World", 1, 18) --> "HelloäöüÄÖÜß…World"
PartOfString("HelloaouAOUs.World", 6, 7) --> "aouAOUs"
PartOfString("HelloäöüÄÖÜß…World", 6, 7) --> "äöüÄO"
If Unicode characters are in the String, the result is wrong whenever the index passed to "substringFromIndex" is not the start index. Even worse, the Swift program sometimes crashes at run time in that same situation. So I decided to create a set of new functions that take care of this problem and work with Unicode characters. Please note that filenames can contain Unicode characters as well, so if you think you do not need Unicode characters, you are wrong.
If you want to reproduce this, you need the same String I used, because copying from this Web-Page does not reproduce the problem.
var s: String = "HelloäöüÄÖÜß…World"
var t: String = s.stringByAddingPercentEscapesUsingEncoding(NSUTF8StringEncoding)!
var u: String = "Helloa%CC%88o%CC%88u%CC%88A%CC%88O%CC%88U%CC%88%C3%9F%E2%80%A6World".stringByRemovingPercentEncoding!
var b: Bool = (s == u) --> true
PartOfString(s, 6, 7) --> "äöüÄO"
Now you could get the idea, to convert the disturbing Canonical-Mapping UniCodes to compatible one with the following function:
// Removes the percent-encoding from `s` and returns the result in precomposed
// form using the compatibility mapping. The decoded value is force-unwrapped.
func percentescapesremove (s: String) -> String
{
    let decoded = s.stringByRemovingPercentEncoding!
    return decoded.precomposedStringWithCompatibilityMapping
}
And the result you will get is:
var v: String = percentescapesremove(t) --> "HelloäöüÄÖÜß...World"
PartOfString(v, 6, 7) --> "äöüÄÖÜß"
var a: Bool = (s == v) --> false
When you do so, the "äöüÄÖÜß" looks good and you think everything is OK, but look at the "...": the single Unicode character "…" has been permanently converted to three plain "." characters, so the result is no longer identical to the first string. If you have Unicode filenames, converting them will result in not finding the file on a volume. So it is a good idea to convert only for screen output and to keep the original String in a safe place.
The problem with the PartOfString function above is that it generates a new String in the first part of the expression and then uses this new String with an index of the old one, which does not work, because the Unicode characters have a different length than the normal letters. So I improved the function (thanks to Martin R for his help):
// Returns `length` characters of `s` beginning at the 1-based position `start`.
// The tail is cut off first and the end index is then computed on that tail, so
// every index is applied to the string it was derived from.
func NewPartOfString(s: String, start: Int, length: Int) -> String
{
    let tail: String = s.substringFromIndex(advance(s.startIndex, start - 1))
    let stop = advance(tail.startIndex, length)
    return tail.substringToIndex(stop)
}
And the result is correct:
NewPartOfString("HelloaouAOUs.World", 1, 5) --> "Hello"
NewPartOfString("HelloäöüÄÖÜß…World", 1, 5) --> "Hello"
NewPartOfString("HelloaouAOUs.World", 1, 18) --> "HelloaouAOUs.World"
NewPartOfString("HelloäöüÄÖÜß…World", 1, 18) --> "HelloäöüÄÖÜß…World"
NewPartOfString("HelloaouAOUs.World", 6, 7) --> "aouAOUs"
NewPartOfString("HelloäöüÄÖÜß…World", 6, 7) --> "äöüÄÖÜß"
In the next step I will show a few functions that can be used and work well. All of them are based on Integer index values that start at 1 for the first character and end with the index of the last character, which is identical to the length of the String.
This function returns the length of a string:
// Returns the number of characters in `s`, as counted by countElements.
func len (s: String) -> Int
{
    let n: Int = countElements(s) // This works not really fast, because of UniCode
    return n
}
This function returns the UniCode-Number of the first UniCode-Character in the String:
// Returns the Unicode scalar value of the first scalar in `s`, or 0 when `s` is empty.
func asc (s: String) -> Int
{
    if (s == "")
    {
        return 0
    }
    let scalars = s.unicodeScalars
    return Int(scalars[scalars.startIndex].value)
}
This function returns the UniCode-Character of the given UniCode-Number:
// Returns a one-character String for the Unicode scalar with the number `c`.
func char (c: Int) -> String
{
    return String(UnicodeScalar(c))
}
This function returns the Upper-Case representation of a String:
// Returns `s` converted to upper case.
func ucase (s: String) -> String
{
    let upper: String = s.uppercaseString
    return upper
}
This function returns the Lower-Case representation of a String:
// Returns `s` converted to lower case.
func lcase (s: String) -> String
{
    let lower: String = s.lowercaseString
    return lower
}
The next Function gives the left part of a String with a given length:
// Returns the first `length` characters of `s`.
// A length below 1 yields "", and a length beyond the end yields the whole string.
func left (s: String, length: Int) -> String
{
    if (length < 1)
    {
        return ""
    }
    if (length > len(s))
    {
        return s
    }
    return s.substringToIndex(advance(s.startIndex, length))
}
The next Function gives the right part of a String with a given length:
// Returns the last `laenge` characters of `s`.
// If `s` has at most `laenge` characters it is returned unchanged; otherwise a
// `laenge` below 1 yields "".
func right (s: String, laenge: Int) -> String
{
    let L: Int = len(s)
    if (L <= laenge)
    {
        return s
    }
    if (laenge < 1)
    {
        return ""
    }
    // Cut the tail first, then measure the requested length on the tail itself.
    let tail: String = s.substringFromIndex(advance(s.startIndex, L - laenge))
    return tail.substringToIndex(advance(tail.startIndex, laenge))
}
The next Function gives the part of a String with a given length:
// Returns up to `laenge` characters of `s` beginning at the 1-based position `start`.
// A start of 1 or less behaves like left(s, laenge). A start beyond the end or a
// `laenge` below 1 yields "". A request that runs past the end is cut at the end.
func mid (s: String, start: Int, laenge: Int) -> String
{
    if (start <= 1)
    {
        return left(s, laenge)
    }
    let L: Int = len(s)
    if ((start > L) || (laenge < 1))
    {
        return ""
    }
    let tail: String = s.substringFromIndex(advance(s.startIndex, start - 1))
    // Number of characters actually taken: everything up to the end, or `laenge`.
    let take: Int = (start + laenge > L) ? (L - start + 1) : laenge
    return tail.substringToIndex(advance(tail.startIndex, take))
}
A little more difficult is to get a character at a given position, because we cannot use "substringFromIndex" and "substringToIndex" with "substringFromIndex" is not the Start-Index. So the idea is to trace through the string, character for character, and get the needed substring.
// Collects the Unicode scalars of `s` whose 1-based position lies in
// index ..< index + length and returns them as a String.
// It counts scalars, not characters: a character made of several scalars
// occupies several positions, so the result can hold fewer characters than `length`.
func CharacterOfString(s: String, index: Int, length: Int) -> String
{
var c: String = ""
var i: Int = 0
for UniCodeChar in s.unicodeScalars
{
// i is the 1-based position of the current scalar.
i = i + 1
if ((i >= index) && (i < index + length))
{
c = c + String(UniCodeChar)
}
}
return (c)
}
But this works not correctly for Strings which contain UniCode-Characters. The following examples show what happens:
CharacterOfString("Swift Example Text aouAOUs.", 16, 8) --> "ext aouA"
len(CharacterOfString("Swift Example Text aouAOUs.", 16, 8)) --> 8
CharacterOfString("Swift Example Text äöüÄÖÜß…", 16, 8) --> "ext äö"
len(CharacterOfString("Swift Example Text äöüÄÖÜß…", 16, 8)) --> 6
So we see that the resulting String is too short, because one visible character can consist of more than one Unicode scalar. For example, "ä" can be a single Unicode scalar, but it can also be written as the two scalars "a" and "¨". So we need another way to get a valid substring.
The solution is to convert the Unicode String to an array of characters and to use the index of the array to get a valid character. This works in all cases to get a single character of a Unicode String at a given index:
// Returns the character at the 1-based position `i` of `s` as a String,
// or "" when `i` lies outside 1...len(s).
func indchar (s: String, i: Int) -> String
{
    if ((i >= 1) && (i <= len(s)))
    {
        return String(Array(s)[i - 1])
    }
    return ""
}
And with this knowledge, I have built a Function, which can get a valid UniCode-Substring with a given Start-Index and a given length:
// Returns `Length` characters of `s` beginning at the 1-based position `Start`.
// A Start outside 1...len(s) yields "". A negative Length, or one that runs past
// the end, takes everything up to the end of the string.
func substring(s: String, Start: Int, Length: Int) -> String
{
    let L: Int = len(s)
    if ((Start < 1) || (Start > L))
    {
        return ""
    }
    let chars = Array(s)
    var last: Int = Start + Length - 1
    if ((Length < 0) || (last > L))
    {
        last = L
    }
    var result: String = ""
    var i: Int = Start
    // With a Length of 0, `last` is Start - 1 and the loop body never runs.
    while (i <= last)
    {
        result = result + String(chars[i - 1])
        i = i + 1
    }
    return result
}
The next Function searches for the position of a given String in another String:
// Searches for `search` in `original`, beginning at position `start`.
// Returns the computed position of the match, or 0 when there is no match.
func position (original: String, search: String, start: Int) -> Int
{
// NOTE(review): `part` is not defined in this listing; presumably it returns
// `original` from position `start` to the end - confirm.
var index = part(original, start).rangeOfString(search)
if (index != nil)
{
// NOTE(review): index!.startIndex comes from the string returned by `part`, but
// the distance is measured from original.startIndex - verify that these two
// indices may be combined.
var pos: Int = distance(original.startIndex, index!.startIndex)
return (pos + start)
}
else
{
return (0)
}
}
This function looks, if a given Character-Code is a number (0-9):
// Returns true when `n` is the character code of an ASCII digit "0" to "9" (48 to 57).
func number (n: Int) -> Bool
{
    // Logical AND on the two comparisons; `&` is the bitwise operator.
    return ((n >= 48) && (n <= 57)) // "0" to "9"
}
Now the basic String operations are shown, but what about numbers? How are numbers converted to Strings and vice versa? Let's have a look at converting Strings to numbers. Please note the "!" in the third line, which is used to get an Int and not an optional Int.
var s: String = "123" --> "123"
var v: Int = s.toInt() --> (123)
var v: Int = s.toInt()! --> 123
But this does not work, if the String contains some characters:
var s: String = "123." --> "123."
var v: Int = s.toInt()! --> Will result in a Runtime Error, because s.toInt() = nil
So I decided to build a smarter function to get the value of a String:
// Returns the integer at the start of `s`: an optional leading "-" followed by
// digits. Parsing stops at the first non-digit; 0 is returned when no digit is found.
func val (s: String) -> Int
{
    // `skip` is 1 when a leading minus sign is present, otherwise 0.
    let skip: Int = (indchar(s, 1) == "-") ? 1 : 0
    // `p` ends up as the 1-based position of the last character of the number.
    var p: Int = skip
    while (number(asc(indchar(s, p + 1))))
    {
        p = p + 1
    }
    if (p <= skip)
    {
        return (0)
    }
    return (left(s, p).toInt()!)
}
Now the result is correct and does not produce a Runtime-Error:
var s: String = "123." --> "123."
var v: Int = val(s) --> 123
And now the same for Floating-Point Numbers:
// Returns the floating-point number at the start of `s`.
//
// Accepted form: an optional leading "-", digits, an optional fraction introduced
// by "." or ",", and an optional exponent introduced by "e" or "E" (the exponent
// itself is read with val, so it may be negative). Parsing stops at the first
// character that does not fit; 0 is returned when no digit is found.
//
// All digits are accumulated as one whole number and scaled once at the end, so
// leading zeros in the fraction ("1.05"), negative numbers ("-1.5") and exponents
// without a fraction ("1e3") are all handled.
func realval (s: String) -> Double
{
    var p: Int = 1
    var negative: Bool = false
    if (indchar(s, 1) == "-")
    {
        negative = true
        p = 2
    }
    var digits: Double = 0 // all digits read so far, as one whole number
    var shift: Int = 0 // how many of those digits belong to the fraction
    // indchar returns "" past the end and asc("") is 0, which ends every loop below.
    var a: Int = asc(indchar(s, p))
    while ((a >= 48) && (a <= 57)) // integer part
    {
        digits = digits * 10 + Double(a - 48)
        p = p + 1
        a = asc(indchar(s, p))
    }
    if ((a == 44) || (a == 46)) // "," or "."
    {
        p = p + 1
        a = asc(indchar(s, p))
        while ((a >= 48) && (a <= 57)) // fractional part
        {
            digits = digits * 10 + Double(a - 48)
            shift = shift + 1
            p = p + 1
            a = asc(indchar(s, p))
        }
    }
    var exp: Int = 0
    if ((a == 69) || (a == 101)) // Exponent
    {
        exp = val(substring(s, p + 1, -1))
    }
    // Net power of ten to apply to `digits`.
    var power: Int = exp - shift
    var scale: Double = 1
    for var i: Int = 1; i <= abs(power); ++i
    {
        scale = scale * 10
    }
    var r: Double = (power >= 0) ? (digits * scale) : (digits / scale)
    // Do not turn a plain zero into -0.0.
    if (negative && (digits != 0))
    {
        r = -r
    }
    return (r)
}
This works for Floating points numbers with exponents:
var s: String = "123.456e3"
var t: String = "123.456e-3"
var v: Double = realval(s) --> 123456
var w: Double = realval(t) --> 0.123456
To generate a String from an Integer is much more simple:
// Returns the decimal text of the integer `n`.
func str (n: Int) -> String
{
    let text: String = String(n)
    return text
}
A String of a floating point variable does not work with String(n) but can be done with:
// Returns the text of the floating-point value `n`, produced by string interpolation.
func strreal (n: Double) -> String
{
    let text: String = "\(n)"
    return text
}

Truncate text to get preview in Scala

I need to truncate a text to get a preview. The preview is the text prefix of ~N chars (but not more) and it should not split words in the middle.
preview("aaa", 10) = "aaa"
preview("a b c", 10) = "a b c"
preview("aaa bbb", 5) = "aaa"
preview("a b ccc", 3) = "a b"
I coded a function as follows:
/** Returns a prefix of `s` that is at most `n` characters long and ends on a word
  * boundary (a single space).
  *
  * Strings of at most `n` characters are returned unchanged. When the first `n`
  * characters contain no space at all, i.e. the first word alone is longer than
  * `n`, the text is cut hard at `n` instead of yielding an empty preview.
  */
def preview(s:String, n:Int) =
  if (s.length <= n) s
  else {
    // lastIndexOf searches backwards from index n (inclusive) and returns -1 if no space is found.
    val cut = s.lastIndexOf(' ', n)
    if (cut < 0) s.take(n) else s.take(cut)
  }
Would you change or fix it ?
Now I am thinking how to handle the case when the text words are separated by one or more white spaces (including \n,\t, etc.) rather than just a single space. How would you improve the function to handle this case ?
How about the following:
/** Returns a prefix of `s` that is at most `n` characters long, ends on a word
  * boundary and carries no surrounding whitespace.
  *
  * Any whitespace character (space, tab, line break) or Unicode space separator
  * counts as a word boundary. The search covers indices 0 to `n` inclusive: a
  * separator at index `n` means the first `n` characters end exactly on a word.
  * When no separator is found, the text is cut hard at `n`.
  */
def preview(s: String, n: Int) = if (s.length <= n) {
  s
} else {
  val cut = s.lastIndexWhere(c => c.isSpaceChar || c.isWhitespace, n)
  (if (cut < 0) s.take(n) else s.take(cut)).trim
}
This function will:
For the strings shorter or equal n return the string (no preview required)
Otherwise, find the last space character in the first n + 1 characters and take the string up to that point. This tells us whether the last word is being split: if it is not, character n + 1 is a space character; otherwise it is a non-space character.
Note: The usage of isSpaceChar will not only provide support for space, but also new line or paragraph, which is what I believe you're after (and you can replace it with isWhitespace if you're after even more extended set of word separators).
I propose next one:
-- UPDATED--
/** Collapses every run of whitespace in `text` to a single space and then shortens
  * the result to at most `max` characters by dropping trailing words. If no space
  * is left to cut at, the remaining text is cut hard at `max`. */
def ellipsize(text : String, max : Int): String = {
  @scala.annotation.tailrec
  def shrink(candidate: String): String =
    if (candidate.length > max) {
      candidate.lastIndexOf(" ") match {
        case -1 => candidate.take(max)
        case cut => shrink(candidate.take(cut))
      }
    } else candidate
  shrink(text.replaceAll("\\s+", " "))
}
Or your (modified):
/** Collapses every run of whitespace in `str` to a single space and returns a
  * prefix of at most `n` characters that ends on a word boundary.
  * When the first `n` characters contain no space, the text is cut hard at `n`
  * instead of yielding an empty preview. */
def preview(str : String, n : Int) = {
  val s = "\\s+".r.replaceAllIn(str, " ")
  if (s.length <= n) s
  else {
    // lastIndexOf returns -1 when there is no space at or before index n.
    val cut = s.lastIndexOf(' ', n)
    if (cut < 0) s.take(n) else s.take(cut)
  }
}
How about this
/** Returns a prefix of `s` that is at most `n` characters long and ends on a word
  * boundary (a single space).
  *
  * The cut is made at the last space at or before index `n`, so every complete
  * word that fits is kept, not just the first one. When there is no such space,
  * the text is cut hard at `n`.
  */
def preview(s:String, n:Int) =
  if (s.length <= n) s
  else {
    val cut = s.lastIndexOf(' ', n)
    if (cut < 0) s.take(n) else s.take(cut)
  }
Try it here: http://scalafiddle.net/console/a05d886123a54de3ca4b0985b718fb9b
This seems to work:
// find the last word that is not split by n, then take to its end
/** Cuts `text` at the end of the last word that ends within the first `n`
  * characters. If not even the first word fits, the text is cut hard at `n`. */
def preview(text: String, n: Int): String = {
  // End offsets of the leading words that still fit completely.
  val wordEnds = """\S+""".r.findAllMatchIn(text).map(_.end).takeWhile(_ <= n).toList
  text.take(wordEnds.lastOption.getOrElse(n))
}
An alternative take (pun intended) but doesn't like input of all whitespace:
text take (("""\S+""".r findAllMatchIn text takeWhile (m => m.start == 0 || m.end <= n)).toList.last.end min n)
Extensionally:
/** Adds a `preview` method to String through an implicit value class. */
object Previewer {
  implicit class `string preview`(val text: String) extends AnyVal {
    /** Cuts the text at the end of the last word that ends within the first `n`
      * characters. If not even the first word fits, the text is cut hard at `n`. */
    def preview(n: Int): String = {
      val wordEnds = """\S+""".r.findAllMatchIn(text).map(_.end).takeWhile(_ <= n).toList
      text.take(wordEnds.lastOption.getOrElse(n))
    }
  }
}
Looks nice that way:
/** Unit tests for the `preview` extension method brought in by `Previewer._`.
  * The test annotation is written `@Test`; `#Test` is not valid Scala. */
class PreviewTest {
  import Previewer._

  // Text shorter than the limit is returned unchanged.
  @Test def shorter(): Unit = {
    assertEquals("aaa", "aaa" preview 10)
  }
  @Test def spacey(): Unit = {
    assertEquals("a b c", "a b c" preview 10)
  }
  // The limit falls inside the second word, so only the first word is kept.
  @Test def split(): Unit = {
    assertEquals("abc", "abc cba" preview 5)
  }
  // The limit falls exactly on the space after the second word.
  @Test def onspace(): Unit = {
    assertEquals("a b", "a b cde" preview 3)
  }
  @Test def trimming(): Unit = {
    assertEquals("a b", "a b cde" preview 5)
  }
  // Without any word the text is cut at the limit.
  @Test def none(): Unit = {
    assertEquals(" " * 5, " " * 8 preview 5)
  }
  // A single word longer than the limit is cut at the limit.
  @Test def prefix(): Unit = {
    assertEquals("a" * 5, "a" * 10 preview 5)
  }
}

Resources