1
votes

I'm trying to use pyspark.mllib.stat.KernelDensity this way:

data = sc.parallelize([0, 1, 2, 2, 1, 1, 1, 1, 1, 2, 0, 0])
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(3)
densities = kd.estimate([-1.0, 2.0, 5.0])

but eventually get this error:

--------------------------------------------------------------------------- Py4JError Traceback (most recent call last) in () 8 9 # Find density estimates for the given values ---> 10 densities = kd.estimate([-1.0, 2.0, 5.0])

/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/stat/KernelDensity.py in estimate(self, points) 56 points = list(points) 57 densities = callMLlibFunc( ---> 58 "estimateKernelDensity", self._sample, self._bandwidth, points) 59 return np.asarray(densities)

/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py in callMLlibFunc(name, *args) 129 api = getattr(sc._jvm.PythonMLLibAPI(), name) 130 print(api) --> 131 return callJavaFunc(sc, api, *args) 132 133

/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py in callJavaFunc(sc, func, *args) 121 """ Call Java Function """ 122 args = [_py2java(sc, a) for a in args] --> 123 return _java2py(sc, func(*args)) 124 125

/home/user10215193/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py in call(self, *args) 1131 answer = self.gateway_client.send_command(command) 1132 return_value = get_return_value( -> 1133 answer, self.gateway_client, self.target_id, self.name) 1134 1135 for temp_arg in temp_args:

/home/user10215193/anaconda3/lib/python3.6/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 321 raise Py4JError( 322 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n". --> 323 format(target_id, ".", name, value)) 324 else: 325 raise Py4JError(

Py4JError: An error occurred while calling o19.estimateKernelDensity. Trace:
py4j.Py4JException: Method estimateKernelDensity([class org.apache.spark.api.java.JavaRDD, class java.lang.Integer, class java.util.ArrayList]) does not exist
    at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
    at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
    at py4j.Gateway.invoke(Gateway.java:272)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:214)
    at java.lang.Thread.run(Thread.java:748)

I couldn't find anything similar here so if somebody can help me with this I would much appreciate it.

1

1 Answer

1
votes

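You have to be careful about the types. The error says that no method estimateKernelDensity(JavaRDD, Integer, ArrayList) exists: Py4J looks up the JVM method by the exact argument types it receives, and the Python int 3 was passed as a java.lang.Integer where a double is expected. So: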

  • bandwidth has to be float
  • sample has to be RDD[float]

So replace your code with:

kd.setSample(data.map(float))
kd.setBandwidth(3.0)
densities = kd.estimate([-1.0, 2.0, 5.0])

and you'll be fine.