logo

Polyglot CheatSheet - Word Count

Last Updated: 2021-11-19

Old School Java

// Maps each lower-cased word to the number of times it occurs in `lines`.
Map<String, Integer> wordcount = new HashMap<>();

for (String line : lines) {
    // Split on runs of non-word characters. For a blank line, or a line that
    // starts with a non-word character, split() returns an empty first token.
    String[] words = line.split("\\W+");
    for (String word : words) {
        if (word.isEmpty()) {
            continue; // the empty token is not a word; do not count it
        }
        String lowerCasedWord = word.toLowerCase();
        wordcount.put(lowerCasedWord, wordcount.getOrDefault(lowerCasedWord, 0) + 1);
    }
}

Java 8 Lambda

// Stream pipeline: split every line into tokens, drop the empty token that
// split() yields for blank lines or lines starting with a non-word character,
// then group case-insensitively and add 1 for each occurrence.
lines.stream()
    .flatMap(line -> Stream.of(line.split("\\W+")))
    .filter(word -> !word.isEmpty())
    .collect(Collectors.groupingBy(String::toLowerCase, Collectors.summingInt(s -> 1)));

Pig

-- TextLoader reads each input line as a single chararray field. The default
-- loader (PigStorage) splits records on tabs, so $0 would only contain the
-- text before the first tab and the remaining words would be dropped.
A = load './input.txt' using TextLoader();
-- Break every line into tokens and emit one row per token.
B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word;
-- Collect identical words into one group each.
C = group B by word;
-- Emit (count, word) for every group.
D = foreach C generate COUNT(B), group;
store D into './wordcount';

Scala

// Folds every non-empty token of the file into an immutable word -> count map.
// split("\\W+") yields an empty first token for blank lines and for lines that
// start with a non-word character, so empty tokens are dropped before counting.
// NOTE: the Source opened here is never closed; outside a throwaway script,
// bind it to a val and call close() in a finally block.
Source.fromFile(filename).getLines()
  .flatMap(_.split("\\W+"))
  .filter(_.nonEmpty)
  .foldLeft(Map.empty[String, Int])((count, word) => count + (word -> (count.getOrElse(word, 0) + 1)))

More Scala variants

  /**
   * Counts word occurrences in the file named by `filename` with a left fold.
   *
   * Lines are split on runs of non-word characters; empty tokens (produced for
   * blank lines and lines starting with a non-word character) are ignored.
   *
   * @return a map from each word to the number of times it occurs
   */
  def useFoldLeft(): Map[String, Int] = {
    val source = Source.fromFile(filename)
    try {
      // foldLeft consumes the whole iterator, so the result is fully built
      // before the source is closed in the finally block.
      source.getLines()
        .flatMap(_.split("\\W+"))
        .filter(_.nonEmpty)
        .foldLeft(Map.empty[String, Int]) { (count, word) =>
          count + (word -> (count.getOrElse(word, 0) + 1))
        }
    } finally {
      source.close()
    }
  }

  /**
   * Counts word occurrences in the file named by `filename` by grouping equal
   * tokens and taking the size of each group.
   *
   * Lines are split on runs of non-word characters; empty tokens (produced for
   * blank lines and lines starting with a non-word character) are ignored.
   *
   * @return a map from each word to the number of times it occurs
   */
  def useGroupBy(): Map[String, Int] = {
    val source = Source.fromFile(filename)
    try {
      source.getLines()
        .flatMap(_.split("\\W+"))
        .filter(_.nonEmpty)
        .toList // materialise all tokens before the source is closed
        .groupBy((word: String) => word)
        // Strict map instead of mapValues, so the counts are computed once
        // here rather than through a lazy view.
        .map { case (word, occurrences) => word -> occurrences.length }
    } finally {
      source.close()
    }
  }

  /**
   * Counts word occurrences in the file named by `filename` by grouping tokens
   * with `identity` and taking the size of each group.
   *
   * Lines are split on runs of non-word characters; empty tokens (produced for
   * blank lines and lines starting with a non-word character) are ignored.
   *
   * @return a map from each word to the number of times it occurs
   */
  def useGroupByWithIdentity(): Map[String, Int] = {
    val source = Source.fromFile(filename)
    try {
      source.getLines()
        .flatMap(_.split("\\W+"))
        .filter(_.nonEmpty)
        .toList // materialise all tokens before the source is closed
        .groupBy(identity)
        // Strict map instead of mapValues, so the counts are computed once
        // here rather than through a lazy view.
        .map { case (word, occurrences) => word -> occurrences.length }
    } finally {
      source.close()
    }
  }

Spark

// Configure a Spark application named "WordCount" with master "local[2]".
val conf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
val sc = new SparkContext(conf)

// Build (word, count) pairs: split every line into tokens, drop the empty
// token that split("\\W+") yields for blank lines or lines starting with a
// non-word character, pair each word with 1, then sum the 1s per word.
val data = sc.textFile(filename)
  .flatMap(_.split("\\W+"))
  .filter(_.nonEmpty)
  .map(x => (x, 1))
  .reduceByKey(_ + _)  // same as .reduceByKey((x, y) => x + y)

// Print one (word, count) pair per line.
println(data.collect().mkString("\n"))