Py4JJavaError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_22732/1401292359.py in
----> 1 feat_df.show(5, vertical=True)
D:\Anaconda3\envs\recall-service-cp4\lib\site-packages\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
482 """
483 if isinstance(truncate, bool) and truncate:
--> 484 print(self._jdf.showString(n, 20, vertical))
485 else:
486 print(self._jdf.showString(n, int(truncate), vertical))
D:\Anaconda3\envs\recall-service-cp4\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1360
1361 answer = self.gateway_client.send_command(command)
-> 1362 return_value = get_return_value(
1363 answer, self.gateway_client, self.target_id, self.name)
1364
D:\Anaconda3\envs\recall-service-cp4\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
D:\Anaconda3\envs\recall-service-cp4\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
325 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
326 if answer[1] == REFERENCE_TYPE:
--> 327 raise Py4JJavaError(
328 "An error occurred while calling {0}{1}{2}.\n".
329 format(target_id, ".", name), value)
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 5.0 failed 1 times, most recent failure: Lost task 8.0 in stage 5.0 (TID 24) (dy-20240328QDOX.lan executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:81)
at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:130)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
... 26 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:81)
at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:130)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
... 26 more
- Solution
The executor failed to launch a Python worker with the right interpreter, so the worker never connected back and the driver's `accept()` timed out. Fix it by pointing `PYSPARK_PYTHON` at the environment's interpreter:

```python
import os
os.environ['PYSPARK_PYTHON'] = "D:\\Anaconda3\\envs\\recall-service-cp4\\python.exe"
```
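
Note that the variable must be set before the SparkSession (and its underlying SparkContext) is created; an already-running context keeps spawning workers with the old interpreter, so restart the kernel first. Below is a minimal end-to-end sketch assuming the same conda environment; the `PYSPARK_DRIVER_PYTHON` line and the UDF smoke test are illustrative additions, not part of the original fix:

```python
import os

# Set the interpreter paths before any SparkContext exists.
interpreter = "D:\\Anaconda3\\envs\\recall-service-cp4\\python.exe"
os.environ['PYSPARK_PYTHON'] = interpreter
os.environ['PYSPARK_DRIVER_PYTHON'] = interpreter  # keep driver and workers consistent

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("pyspark-worker-smoke-test").getOrCreate()

# A Python UDF forces Spark to spawn a Python worker, exercising the
# BatchEvalPythonExec code path that failed in the traceback above.
double_it = udf(lambda x: x * 2, IntegerType())
df = spark.createDataFrame([(1,), (2,)], ["x"])
df.withColumn("x2", double_it("x")).show(vertical=True)
```

If the smoke test prints rows instead of raising `Py4JJavaError`, the original `feat_df.show(5, vertical=True)` should also succeed in the fresh kernel.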