76
edits
Changes
m
<Code>
#!/usr/bin/env python
"""Count how often each whitespace-separated word occurs in a text input.

Command-line arguments:
    sys.argv[1]: URI of the text to read.
    sys.argv[2]: URI under which the (word, count) pairs are saved.
"""
import sys

import pyspark

sc = pyspark.SparkContext()

# One RDD element per line of the input text.
lines = sc.textFile(sys.argv[1])

# str.split() with no argument splits on any run of whitespace, so empty
# lines contribute no words.
words = lines.flatMap(lambda line: line.split())

# Pair every word with 1, then add up the 1s for each distinct word.
wordCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda count1, count2: count1 + count2)

wordCounts.saveAsTextFile(sys.argv[2])
</Code>
→Analysis: Spark vs Hadoop
# To open Browser: Menu -> Storage -> Browser
# Drag and drop the word-count.py file shown below into the browser, or use 'UPLOAD FILES' to upload it.
import sys

import pyspark

# Fail fast with a usage message instead of an IndexError further down:
# sys.argv holds the script name plus exactly two URIs.
if len(sys.argv) != 3:
    raise Exception("Exactly 2 arguments are required: <inputUri> <outputUri>")

inputUri = sys.argv[1]
outputUri = sys.argv[2]

sc = pyspark.SparkContext()

# One RDD element per line of the input text.
lines = sc.textFile(inputUri)

# str.split() with no argument splits on any run of whitespace, so empty
# lines contribute no words.
words = lines.flatMap(lambda line: line.split())

# Pair every word with 1, then add up the 1s for each distinct word.
wordCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda count1, count2: count1 + count2)

wordCounts.saveAsTextFile(outputUri)
=== Results ===