Polyglot CheatSheet - Word Count
Last Updated: 2021-11-19
Old School Java
// Word count, classic imperative style: tally lower-cased tokens into a HashMap.
Map<String, Integer> wordcount = new HashMap<>();
for (String line : lines) {
    // \W+ splits on runs of non-word characters.
    for (String token : line.split("\\W+")) {
        String key = token.toLowerCase();
        Integer seen = wordcount.get(key);
        wordcount.put(key, seen == null ? 1 : seen + 1);
    }
}
Java 8 Lambda
// Word count, streams style: tokenize, lower-case each token up front,
// then group equal words and add 1 per occurrence (yields Map<String, Integer>).
lines.stream()
    .flatMap(line -> Stream.of(line.split("\\W+")))
    .map(String::toLowerCase)
    .collect(Collectors.groupingBy(word -> word, Collectors.summingInt(word -> 1)));
Pig
-- Word count in Pig Latin: tokenize each input line, group identical words,
-- then emit (count, word) pairs.
lines = load './input.txt';
tokens = foreach lines generate flatten(TOKENIZE((chararray)$0)) as word;
by_word = group tokens by word;
counts = foreach by_word generate COUNT(tokens), group;
store counts into './wordcount';
Scala
// Word count: fold every token into an immutable tally map.
Source.fromFile(filename).getLines()
  .flatMap(_.split("\\W+"))
  .foldLeft(Map.empty[String, Int]) { (tally, word) =>
    tally.updated(word, tally.getOrElse(word, 0) + 1)
  }
More Scala variants (as methods)
/** Counts word occurrences by folding the token stream into an immutable map.
  *
  * Fix: the original never closed the `Source`, leaking a file handle per
  * call. try/finally guarantees the close even if reading throws. The fold
  * is eager, so the iterator is fully consumed before the close runs.
  */
def useFoldLeft(): Map[String, Int] = {
  val source = Source.fromFile(filename)
  try {
    source.getLines()
      .flatMap(_.split("\\W+"))
      .foldLeft(Map.empty[String, Int]) { (counts, word) =>
        counts.updated(word, counts.getOrElse(word, 0) + 1)
      }
  } finally source.close()
}
/** Counts word occurrences by grouping equal tokens of the whole file.
  *
  * Fixes: (1) replaced `.mapValues(_.length)` — a lazy view that recomputes
  * the length on every lookup and is deprecated since Scala 2.13 — with a
  * strict `map`; (2) closed the `Source` the original leaked.
  */
def useGroupBy() = {
  val source = Source.fromFile(filename)
  try {
    source.getLines()
      .flatMap(_.split("\\W+")).toList
      .groupBy((word: String) => word)
      .map { case (word, occurrences) => word -> occurrences.size }
  } finally source.close()
}
/** Same as useGroupBy, but uses `identity` as the grouping key.
  *
  * Fixes: (1) removed a stray bare `Vector` expression left in the body
  * (a pasted section header whose value was silently discarded);
  * (2) materialized the lazy, 2.13-deprecated `.mapValues` view with a
  * strict `map`; (3) closed the `Source` the original leaked.
  */
def useGroupByWithIdentity() = {
  val source = Source.fromFile(filename)
  try {
    source.getLines()
      .flatMap(_.split("\\W+")).toList
      .groupBy(identity)
      .map { case (word, occurrences) => word -> occurrences.size }
  } finally source.close()
}
Spark
// Word count on Spark, running locally with two worker threads.
val conf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
val sc = new SparkContext(conf)
// Pair every token with a 1, then sum the 1s per distinct word.
val data = sc.textFile(filename)
  .flatMap(_.split("\\W+"))
  .map(word => (word, 1))
  .reduceByKey((total, n) => total + n) // equivalent to .reduceByKey(_ + _)
println(data.collect().mkString("\n"))