How do I find the count of missing values in a pyspark data frame - pyspark

I'm using pyspark 3.2.1. I'm trying to find the count of missing values in each column of my pyspark data frame.
So I used the following code:
dataColumns=['columns in my data frame'][count(when(isnan(c), c)).alias(c) for c in dataColumns]).show(truncate=False)
But I got error message
AnalysisException Traceback (most recent call last)
<ipython-input-56-6c7766e33c77> in <module>()
1 dataColumns=['myDate']
----> 2[count(when(isnan(c), c)).alias(c) for c in dataColumns]).show(truncate=False)
/usr/local/spark/python/pyspark/sql/ in select(self, *cols)
1667 [Row(name='Alice', age=12), Row(name='Bob', age=15)]
1668 """
-> 1669 jdf =*cols))
1670 return DataFrame(jdf, self.sql_ctx)
/usr/local/spark/python/lib/ in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id,
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/ in deco(*a, **kw)
115 # Hide where the exception came from that shows a non-Pythonic
116 # JVM exception message.
--> 117 raise converted from None
118 else:
119 raise
AnalysisException: cannot resolve 'isnan(`myDate`)' due to data type mismatch: argument 1 requires (double or float) type, however, '`myDate`' is of timestamp type.;
'Aggregate [count(CASE WHEN isnan(myDate#1994) THEN myDate END) AS myDate#5831]
Can you please help me to resolve this issue?


Error when loading pipelines in spaCy 3.0

After updating to spaCy 3.0.6 I haven't been able to load in either of the trained pipelines, although both seem to be properly installed:
================= Installed pipeline packages (spaCy v3.0.6) =================
ℹ spaCy installation:
en_core_web_sm >=3.0.0,<3.1.0 3.0.0 ✔
en_core_web_trf >=3.0.0,<3.1.0 3.0.0 ✔
This occurs when using spacy.load() and when importing the pipelines as a module (the error is identical for all of the following lines):
nlp = spacy.load("en_core_web_trf")
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()
import en_core_web_trf
nlp = en_core_web_trf.load()
The error I'm getting is the following:
ImportError Traceback (most recent call last)
<ipython-input-9-b38eb3aae320> in <module>
1 import en_core_web_trf
----> 2 nlp = en_core_web_trf.load()
~/anaconda3/envs/ml/lib/python3.8/site-packages/en_core_web_trf/ in load(**overrides)
9 def load(**overrides):
---> 10 return load_model_from_init_py(__file__, **overrides)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/ in load_model_from_init_py(init_file, vocab, disable, exclude, config)
514 if not model_path.exists():
515 raise IOError(Errors.E052.format(path=data_path))
--> 516 return load_model_from_path(
517 data_path,
518 vocab=vocab,
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/ in load_model_from_path(model_path, meta, vocab, disable, exclude, config)
389 config_path = model_path / "config.cfg"
390 config = load_config(config_path, overrides=dict_to_dot(config))
--> 391 nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
392 return nlp.from_disk(model_path, exclude=exclude)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/ in load_model_from_config(config, vocab, disable, exclude, auto_fill, validate)
426 # registry, including custom subclasses provided via entry points
427 lang_cls = get_lang_class(nlp_config["lang"])
--> 428 nlp = lang_cls.from_config(
429 config,
430 vocab=vocab,
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/ in from_config(cls, config, vocab, disable, exclude, meta, auto_fill, validate)
1637 # then we would load them twice at runtime: once when we make from config,
1638 # and then again when we load from disk.
-> 1639 nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
1640 if after_creation is not None:
1641 nlp = after_creation(nlp)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy/ in __init__(self, vocab, max_length, meta, create_tokenizer, batch_size, **kwargs)
148 # points. The factory decorator applied to these functions takes care
149 # of the rest.
--> 150 util.registry._entry_point_factories.get_all()
152 self._config = DEFAULT_CONFIG.merge(self.default_config)
~/anaconda3/envs/ml/lib/python3.8/site-packages/catalogue/ in get_all(self)
106 result = {}
107 if self.entry_points:
--> 108 result.update(self.get_entry_points())
109 for keys, value in REGISTRY.items():
110 if len(self.namespace) == len(keys) - 1 and all(
~/anaconda3/envs/ml/lib/python3.8/site-packages/catalogue/ in get_entry_points(self)
121 result = {}
122 for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
--> 123 result[] = entry_point.load()
124 return result
~/anaconda3/envs/ml/lib/python3.8/importlib/ in load(self)
75 """
76 match = self.pattern.match(self.value)
---> 77 module = import_module('module'))
78 attrs = filter(None, ('attr') or '').split('.'))
79 return functools.reduce(getattr, attrs, module)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in import_module(name, package)
125 break
126 level += 1
--> 127 return _bootstrap._gcd_import(name[level:], package, level)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _gcd_import(name, package, level)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _find_and_load(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _find_and_load_unlocked(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _call_with_frames_removed(f, *args, **kwds)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _gcd_import(name, package, level)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _find_and_load(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _find_and_load_unlocked(name, import_)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _load_unlocked(spec)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in exec_module(self, module)
~/anaconda3/envs/ml/lib/python3.8/importlib/ in _call_with_frames_removed(f, *args, **kwds)
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/ in <module>
----> 1 from . import architectures
2 from . import annotation_setters
3 from . import span_getters
4 from .layers import TransformerModel
5 from .pipeline_component import Transformer, install_extensions
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/ in <module>
3 from thinc.types import Ragged, Floats2d
4 from spacy.tokens import Doc
----> 5 from .layers import TransformerModel, TransformerListener
6 from .layers import trfs2arrays, split_trf_batch
7 from .util import registry
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/layers/ in <module>
----> 1 from .listener import TransformerListener
2 from .transformer_model import TransformerModel
3 from .split_trf import split_trf_batch
4 from .trfs2arrays import trfs2arrays
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/layers/ in <module>
2 from thinc.api import Model
3 from spacy.tokens import Doc
----> 4 from ..data_classes import TransformerData
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/ in <module>
9 import srsly
---> 11 from .util import transpose_list
12 from .align import get_token_positions
~/anaconda3/envs/ml/lib/python3.8/site-packages/spacy_transformers/ in <module>
2 from pathlib import Path
3 import random
----> 4 from transformers import AutoModel, AutoTokenizer
5 from transformers.tokenization_utils import BatchEncoding
6 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
~/anaconda3/envs/ml/lib/python3.8/site-packages/transformers/ in <module>
625 # Trainer
--> 626 from .trainer import Trainer
627 from .trainer_pt_utils import torch_distributed_zero_first
628 else:
~/anaconda3/envs/ml/lib/python3.8/site-packages/transformers/ in <module>
67 TrainerState,
68 )
---> 69 from .trainer_pt_utils import (
70 DistributedTensorGatherer,
71 SequentialDistributedSampler,
~/anaconda3/envs/ml/lib/python3.8/site-packages/transformers/ in <module>
39 else:
---> 40 from torch.optim.lr_scheduler import SAVE_STATE_WARNING
42 logger = logging.get_logger(__name__)
ImportError: cannot import name 'SAVE_STATE_WARNING' from 'torch.optim.lr_scheduler' (/Users/baconbaker/anaconda3/envs/ml/lib/python3.8/site-packages/torch/optim/
Reverting to torch 1.4.0 from the current stable release 1.8.1 solves the problem, but I don't want to do so.
Is there an alternative solution?
It looks like this is fixed in newer versions of transformers. Try upgrading both transformers and spacy-transformers.

networkx maximum_flow crashes on some pairs of nodes

I have a graph composed of 742 edges, and 360 nodes.
I want to compute the max flow between some pairs of nodes, and for some of them nx.maximum_flow ends with the error pasted below, despite the fact that a path exists between the two nodes concerned.
Any idea what causes that?
ValueError Traceback (most recent call last)
<ipython-input-186-6dae3501e3d0> in <module>()
1 #print(nx.shortest_path(G,source="Sink_0",target="node_32"))
----> 2 nx.maximum_flow(G, "Sink_0", "Aircraft2_32")
/Library/Python/2.7/site-packages/networkx/algorithms/flow/maxflow.pyc in maximum_flow(G, s, t, capacity, flow_func, **kwargs)
156 raise nx.NetworkXError("flow_func has to be callable.")
--> 158 R = flow_func(G, s, t, capacity=capacity, value_only=False, **kwargs)
159 flow_dict = build_flow_dict(G, R)
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in preflow_push(G, s, t, capacity, residual, global_relabel_freq, value_only)
420 """
421 R = preflow_push_impl(G, s, t, capacity, residual, global_relabel_freq,
--> 422 value_only)
423 R.graph['algorithm'] = 'preflow_push'
424 return R
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in preflow_push_impl(G, s, t, capacity, residual, global_relabel_freq, value_only)
279 break
280 u = next(iter(
--> 281 height = discharge(u, False)
282 if grt.is_reached():
283 # Global relabeling heuristic.
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in discharge(u, is_phase1)
156 # We have run off the end of the adjacency list, and there can
157 # be no more admissible edges. Relabel the node to create one.
--> 158 height = relabel(u)
159 if is_phase1 and height >= n - 1:
160 # Although the node is still active, with a height at least
/Library/Python/2.7/site-packages/networkx/algorithms/flow/preflowpush.pyc in relabel(u)
125 """
126 grt.add_work(len(R_succ[u]))
--> 127 return min(R_node[v]['height'] for v, attr in R_succ[u].items()
128 if attr['flow'] < attr['capacity']) + 1
ValueError: min() arg is an empty sequence

How can I use sqlContext in a Spark UDF

I'm trying to load a json file in a Spark UDF and use it to query something. What I need to do is take a column value (storeId) from a data frame and use it in
But I get a pickle error. If I write the code without the sqlContext, it works.
Is there any workaround, or is this not possible?
def get_id_udf (storeId,sqlContext):
df ="file_url_s3")
if storeId == None:
return None
return None
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
desc_udf = udf(lambda storeId : get_id_udf(storeId,sqlContext), IntegerType())
TypeError Traceback (most recent call last)
<ipython-input-22-b5c4070c110e> in <module>()
1 from pyspark.sql.functions import udf, col
2 from pyspark.sql.types import IntegerType
----> 3 desc_udf = udf(lambda storeId : get_cluster_id_udf(storeId,sqlContext), IntegerType())
/usr/lib/spark/python/pyspark/sql/ in udf(f, returnType)
1799 [Row(slen=5), Row(slen=3)]
1800 """
-> 1801 return UserDefinedFunction(f, returnType)
1803 blacklist = ['map', 'since', 'ignore_unicode_prefix']
/usr/lib/spark/python/pyspark/sql/ in __init__(self, func, returnType, name)
1758 self.returnType = returnType
1759 self._broadcast = None
-> 1760 self._judf = self._create_judf(name)
1762 def _create_judf(self, name):
/usr/lib/spark/python/pyspark/sql/ in _create_judf(self, name)
1763 from pyspark.sql import SQLContext
1764 sc = SparkContext.getOrCreate()
-> 1765 wrapped_func = _wrap_function(sc, self.func, self.returnType)
1766 ctx = SQLContext.getOrCreate(sc)
1767 jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
/usr/lib/spark/python/pyspark/sql/ in _wrap_function(sc, func, returnType)
1743 def _wrap_function(sc, func, returnType):
1744 command = (func, returnType)
-> 1745 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
1746 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
1747 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/usr/lib/spark/python/pyspark/ in _prepare_for_python_RDD(sc, command)
2313 # the serialized command will be compressed by broadcast
2314 ser = CloudPickleSerializer()
-> 2315 pickled_command = ser.dumps(command)
2316 if len(pickled_command) > (1 << 20): # 1M
2317 # The broadcast will have same life cycle as created PythonRDD
/usr/lib/spark/python/pyspark/ in dumps(self, obj)
427 def dumps(self, obj):
--> 428 return cloudpickle.dumps(obj, 2)
/usr/lib/spark/python/pyspark/ in dumps(obj, protocol)
656 cp = CloudPickler(file,protocol)
--> 657 cp.dump(obj)
659 return file.getvalue()
/usr/lib/spark/python/pyspark/ in dump(self, obj)
105 self.inject_addons()
106 try:
--> 107 return Pickler.dump(self, obj)
108 except RuntimeError as e:
109 if 'recursion' in e.args[0]:
/usr/lib64/python2.7/pickle.pyc in dump(self, obj)
222 if self.proto >= 2:
223 self.write(PROTO + chr(self.proto))
--> 224
225 self.write(STOP)
/usr/lib64/python2.7/pickle.pyc in save(self, obj)
284 f = self.dispatch.get(t)
285 if f:
--> 286 f(self, obj) # Call unbound method with explicit self
287 return
/usr/lib64/python2.7/pickle.pyc in save_dict(self, obj)
654 self.memoize(obj)
--> 655 self._batch_setitems(obj.iteritems())
657 dispatch[DictionaryType] = save_dict
/usr/lib64/python2.7/pickle.pyc in _batch_setitems(self, items)
685 for k, v in tmp:
686 save(k)
--> 687 save(v)
688 write(SETITEMS)
689 elif n:
/usr/lib64/python2.7/pickle.pyc in save(self, obj)
304 reduce = getattr(obj, "__reduce_ex__", None)
305 if reduce:
--> 306 rv = reduce(self.proto)
307 else:
308 reduce = getattr(obj, "__reduce__", None)
TypeError: 'JavaPackage' object is not callable

Applying scipy.sparse.linalg.svds throws a Memory Error?

I am trying to decompose a sparse matrix (40,000×1,400,000) with scipy.sparse.linalg.svds on my 64-bit machine with 140GB RAM, as follows:
k = 5000
tfidf_mtx = tfidf_m.tocsr()
u_45,s_45,vT_45 = scipy.sparse.linalg.svds(tfidf_mtx, k=k)
When k ranges from 1000 to 4500, it works. But when k is 5000, it throws a MemoryError. The precise error is given below:
MemoryError Traceback (most recent call last)
<ipython-input-6-31a69ce54e2c> in <module>()
4 k = 4000
5 tfidf_mtx = tfidf_m.tocsr()
----> 6 get_ipython().magic(u'time u_50,s_50,vT_50 =linalg.svds(tfidf_mtx, k=k))
7 # print len(s),s
/usr/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2163 magic_name, _, magic_arg_s = arg_s.partition(' ')
2164 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2165 return self.run_line_magic(magic_name, magic_arg_s)
2167 #-------------------------------------------------------------------------
/usr/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2084 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2085 with self.builtin_trap:
-> 2086 result = fn(*args,**kwargs)
2087 return result
/usr/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
/usr/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
189 # but it's overkill for just that one bit of state.
190 def magic_deco(arg):
--> 191 call = lambda f, *a, **k: f(*a, **k)
193 if callable(arg):
/usr/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
1043 else:
1044 st = clock2()
-> 1045 exec code in glob, local_ns
1046 end = clock2()
1047 out = None
<timed exec> in <module>()
/usr/local/lib/python2.7/dist-packages/scipy/sparse/linalg/eigen/arpack/arpack.pyc in svds(A, k, ncv, tol, which, v0, maxiter, return_singular_vectors)
1751 else:
1752 ularge = eigvec[:, above_cutoff]
-> 1753 vhlarge = _herm(X_matmat(ularge) / slarge)
1755 u = _augmented_orthonormal_cols(ularge, nsmall)
/usr/local/lib/python2.7/dist-packages/scipy/sparse/base.pyc in dot(self, other)
245 """
--> 246 return self * other
248 def __eq__(self, other):
/usr/local/lib/python2.7/dist-packages/scipy/sparse/base.pyc in __mul__(self, other)
298 return self._mul_vector(other.ravel()).reshape(M, 1)
299 elif other.ndim == 2 and other.shape[0] == N:
--> 300 return self._mul_multivector(other)
302 if isscalarlike(other):
/usr/local/lib/python2.7/dist-packages/scipy/sparse/compressed.pyc in _mul_multivector(self, other)
464 result = np.zeros((M,n_vecs), dtype=upcast_char(self.dtype.char,
--> 465 other.dtype.char))
467 # csr_matvecs or csc_matvecs
When k is 3000 and 4500, the ratio of the sum of the squared singular values to the sum of the squares of all matrix entries is 0.7033 and 0.8230, respectively. I have searched the net for a long time without success. Please help, or suggest some ideas for how to achieve this.
So the return is an (M,k) array. On an ordinary older machine:
In [368]: np.ones((40000,1000))
In [369]: np.ones((40000,4000))
In [370]: np.ones((40000,5000))
--> 190 a = empty(shape, dtype, order)
191 multiarray.copyto(a, 1, casting='unsafe')
192 return a
Now it may just be a coincidence that I hit the memory error at the same size as your code. But if you make the problem big enough, you will hit memory errors at some point.
Your stacktrace shows the error occurs while multiplying a sparse matrix and a dense 2d array (other), and the result will be dense as well.

pyspark in Ipython notebook raises Py4JNetworkError

I was using IPython notebook to run PySpark with just adding the following to the notebook:
import os
import sys
import pandas as pd
%pylab inline
from IPython.display import Image
sys.path.append( os.path.join(os.environ['SPARK_HOME'], 'python') )
sys.path.append( os.path.join(os.environ['SPARK_HOME'], 'bin') )
sys.path.append( os.path.join(os.environ['SPARK_HOME'], 'python/lib/') )
from pyspark import SparkContext
sc = SparkContext('local')
This worked fine for one project, but on my second project, after running a couple of lines (not the same ones every time), I get the following error:
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/py4j-", line 425, in start
self.socket.connect((self.address, self.port))
File "/usr/lib/python2.7/", line 224, in meth
return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused
Py4JNetworkError Traceback (most recent call last)
<ipython-input-21-4626925bbe8f> in <module>()
----> 1 words.count()
/home/eee/Desktop/NLP/spark-1.3.1-bin-hadoop2.6/python/pyspark/rdd.pyc in count(self)
930 3
931 """
--> 932 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
934 def stats(self):
/home/eee/Desktop/NLP/spark-1.3.1-bin-hadoop2.6/python/pyspark/rdd.pyc in sum(self)
921 6.0
922 """
--> 923 return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
925 def count(self):
/home/eee/Desktop/NLP/spark-1.3.1-bin-hadoop2.6/python/pyspark/rdd.pyc in reduce(self, f)
737 yield reduce(f, iterator, initial)
--> 739 vals = self.mapPartitions(func).collect()
740 if vals:
741 return reduce(f, vals)
/home/eee/Desktop/NLP/spark-1.3.1-bin-hadoop2.6/python/pyspark/rdd.pyc in collect(self)
710 Return a list that contains all of the elements in this RDD.
711 """
--> 712 with SCCallSiteSync(self.context) as css:
713 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
714 return list(_load_from_socket(port, self._jrdd_deserializer))
/home/eee/Desktop/NLP/spark-1.3.1-bin-hadoop2.6/python/pyspark/traceback_utils.pyc in __enter__(self)
70 def __enter__(self):
71 if SCCallSiteSync._spark_stack_depth == 0:
---> 72 self._context._jsc.setCallSite(self._call_site)
73 SCCallSiteSync._spark_stack_depth += 1
/usr/local/lib/python2.7/dist-packages/py4j- in __call__(self, *args)
--> 536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
538 self.target_id,
/usr/local/lib/python2.7/dist-packages/py4j- in send_command(self, command, retry)
360 the Py4J protocol.
361 """
--> 362 connection = self._get_connection()
363 try:
364 response = connection.send_command(command)
/usr/local/lib/python2.7/dist-packages/py4j- in _get_connection(self)
316 connection = self.deque.pop()
317 except Exception:
--> 318 connection = self._create_connection()
319 return connection
/usr/local/lib/python2.7/dist-packages/py4j- in _create_connection(self)
323 connection = GatewayConnection(self.address, self.port,
324 self.auto_close, self.gateway_property)
--> 325 connection.start()
326 return connection
/usr/local/lib/python2.7/dist-packages/py4j- in start(self)
430 'server'
431 logger.exception(msg)
--> 432 raise Py4JNetworkError(msg)
434 def close(self):
Py4JNetworkError: An error occurred while trying to connect to the Java server
Once this happens, other lines that worked before now raise the same error.
Any ideas?
Specifications for:
pyspark 1.4.1
ipython 4.0.0
[OSX / homebrew]
If you want to launch pyspark within a Jupyter (ex-iPython) Notebook using the iPython kernel, I advise you to launch your notebook directly with the pyspark command:
But in order to do that, you need to add three lines in your bash .profile or zsh .zshrc profile to set these environment variables:
export SPARK_HOME=/path/to/apache-spark/1.4.1/libexec
export PYSPARK_DRIVER_PYTHON=ipython2 # remember that Apache-Spark only works with python2.7
In my case, given that I'm on OSX and installed apache-spark with Homebrew, this is:
export SPARK_HOME=/usr/local/Cellar/apache-spark/1.4.1/libexec
Then, when you execute the command 'pyspark' in your terminal, your terminal will automatically open a Jupyter (ex-iPython) notebook in your default Browser.
[I 17:51:00.209 NotebookApp] Serving notebooks from local directory: /Users/Thibault/code/kaggle
[I 17:51:00.209 NotebookApp] 0 active kernels
[I 17:51:00.210 NotebookApp] The IPython Notebook is running at: http://localhost:42424/
[I 17:51:00.210 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[I 17:51:11.980 NotebookApp] Kernel started: 53ad11b1-4fa4-459d-804c-0487036b0f29
15/09/02 17:51:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable