start-dfs.sh && start-yarn.sh
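To check that the HDFS and YARN daemons came up, jps (bundled with the JDK) lists the running Java processes; you should see NameNode, DataNode, ResourceManager, and NodeManager among them:

jps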
pip3 install pyspark
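You can verify the installation and see which Spark version was pulled in:

python3 -c 'import pyspark; print(pyspark.__version__)'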
echo 'alias python=python3' >> ~/.bashrc
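The alias only takes effect in new shells, so reload the configuration first:

source ~/.bashrc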
Launch the interactive PySpark shell; it pre-creates a SparkContext and exposes it as sc, which the code below relies on:

pyspark
import random
import time

NUM_SAMPLES = 100000000

# A point (x, y) drawn uniformly from the unit square lands inside the
# quarter circle of radius 1 with probability pi/4, so 4 times the hit
# fraction estimates pi. The element p is ignored; each call draws a point.
def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1

t1 = time.time()
count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
print('Time Taken: {} s'.format(time.time() - t1))

pi = 4 * count / NUM_SAMPLES
print('Pi is roughly', pi)
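By default, parallelize picks a partition count from spark.default.parallelism. If you want to experiment with how the sampling work is split across tasks, the optional numSlices argument sets it explicitly; the 100 below is only an illustrative value:

count = sc.parallelize(range(0, NUM_SAMPLES), 100).filter(inside).count()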
Alternatively, save the snippet above as pi_spark.py and have the shell execute it at startup:

PYTHONSTARTUP=pi_spark.py pyspark
For a standalone script that runs outside the shell, create the SparkSession yourself. Save the following as pi_spark2.py:

import time
import random
from pyspark.sql import SparkSession

# Outside the shell there is no pre-built context, so create one explicitly.
spark = SparkSession.builder.appName('CalculatePi').getOrCreate()
sc = spark.sparkContext

NUM_SAMPLES = 100000000

def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1

t1 = time.time()
count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
print('Time Taken: {} s'.format(time.time() - t1))

pi = 4 * count / NUM_SAMPLES
print('Pi is roughly', pi)

spark.stop()
Then run it, either directly with Python or through spark-submit:
python pi_spark2.py
# or
spark-submit pi_spark2.py
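Because HDFS and YARN are already running, the same script can also be submitted to the cluster instead of running locally. A minimal sketch, assuming HADOOP_CONF_DIR (or YARN_CONF_DIR) points at your Hadoop configuration:

spark-submit --master yarn pi_spark2.py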