
Source code for pyspark.sql.dataframe

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import random

if sys.version >= '3':
    basestring = unicode = str
    long = int
    from functools import reduce
else:
    from itertools import imap as map

import warnings

from pyspark import copy_func, since, _NoValue
from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix
from pyspark.serializers import ArrowSerializer, BatchedSerializer, PickleSerializer, \
    UTF8Deserializer
from pyspark.storagelevel import StorageLevel
from pyspark.traceback_utils import SCCallSiteSync
from pyspark.sql.types import _parse_datatype_json_string
from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column
from pyspark.sql.readwriter import DataFrameWriter
from pyspark.sql.streaming import DataStreamWriter
from pyspark.sql.types import IntegralType
from pyspark.sql.types import *
from pyspark.util import _exception_message

__all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"]

class DataFrame(object):
    """A distributed collection of data grouped into named columns.

    A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
    and can be created using various functions in :class:`SparkSession`::

        people = spark.read.parquet("...")

    Once created, it can be manipulated using the various domain-specific-language
    (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.

    To select a column from the data frame, use the apply method::

        ageCol = people.age

    A more concrete example::

        # To create DataFrame using SparkSession
        people = spark.read.parquet("...")
        department = spark.read.parquet("...")

        people.filter(people.age > 30).join(department, people.deptId == department.id) \\
          .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})

    .. versionadded:: 1.3
    """

    def __init__(self, jdf, sql_ctx):
        self._jdf = jdf
        self.sql_ctx = sql_ctx
        self._sc = sql_ctx and sql_ctx._sc
        self.is_cached = False
        self._schema = None  # initialized lazily
        self._lazy_rdd = None

    @property
    @since(1.3)
    def rdd(self):
        """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
        """
        if self._lazy_rdd is None:
            jrdd = self._jdf.javaToPython()
            self._lazy_rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
        return self._lazy_rdd

    @property
    @since("1.3.1")
    def na(self):
        """Returns a :class:`DataFrameNaFunctions` for handling missing values.
        """
        return DataFrameNaFunctions(self)

    @property
    @since(1.4)
    def stat(self):
        """Returns a :class:`DataFrameStatFunctions` for statistic functions.
        """
        return DataFrameStatFunctions(self)

    @ignore_unicode_prefix
    @since(1.3)
    def toJSON(self, use_unicode=True):
        """Converts a :class:`DataFrame` into a :class:`RDD` of string.

        Each row is turned into a JSON document as one element in the returned RDD.

        >>> df.toJSON().first()
        u'{"age":2,"name":"Alice"}'
        """
        rdd = self._jdf.toJSON()
        return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))

    @since(1.3)
    def registerTempTable(self, name):
        """Registers this DataFrame as a temporary table using the given name.

        The lifetime of this temporary table is tied to the :class:`SparkSession`
        that was used to create this :class:`DataFrame`.

        >>> df.registerTempTable("people")
        >>> df2 = spark.sql("select * from people")
        >>> sorted(df.collect()) == sorted(df2.collect())
        True
        >>> spark.catalog.dropTempView("people")

        .. note:: Deprecated in 2.0, use createOrReplaceTempView instead.
        """
        warnings.warn(
            "Deprecated in 2.0, use createOrReplaceTempView instead.", DeprecationWarning)
        self._jdf.createOrReplaceTempView(name)

    @since(2.0)
    def createOrReplaceTempView(self, name):
        """Creates or replaces a local temporary view with this DataFrame.

        The lifetime of this temporary table is tied to the :class:`SparkSession`
        that was used to create this :class:`DataFrame`.

        >>> df.createOrReplaceTempView("people")
        >>> df2 = df.filter(df.age > 3)
        >>> df2.createOrReplaceTempView("people")
        >>> df3 = spark.sql("select * from people")
        >>> sorted(df3.collect()) == sorted(df2.collect())
        True
        >>> spark.catalog.dropTempView("people")
        """
        self._jdf.createOrReplaceTempView(name)

    @property
    @since(1.4)
    def write(self):
        """
        Interface for saving the content of the non-streaming :class:`DataFrame` out into
        external storage.

        :return: :class:`DataFrameWriter`
        """
        return DataFrameWriter(self)

    @property
    @since(2.0)
    def writeStream(self):
        """
        Interface for saving the content of the streaming :class:`DataFrame` out into
        external storage.

        .. note:: Evolving.

        :return: :class:`DataStreamWriter`
        """
        return DataStreamWriter(self)

    @property
    @since(1.3)
    def schema(self):
        """Returns the schema of this :class:`DataFrame` as a
        :class:`pyspark.sql.types.StructType`.

        >>> df.schema
        StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
        """
        if self._schema is None:
            try:
                self._schema = _parse_datatype_json_string(self._jdf.schema().json())
            except AttributeError as e:
                raise Exception(
                    "Unable to parse datatype from schema. %s" % e)
        return self._schema

    @since(1.3)
    def isLocal(self):
        """Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally
        (without any Spark executors).
        """
        return self._jdf.isLocal()

    @property
    @since(2.0)
    def isStreaming(self):
        """Returns true if this :class:`Dataset` contains one or more sources that continuously
        return data as it arrives. A :class:`Dataset` that reads data from a streaming source
        must be executed as a :class:`StreamingQuery` using the :func:`start` method in
        :class:`DataStreamWriter`. Methods that return a single answer (e.g., :func:`count` or
        :func:`collect`) will throw an :class:`AnalysisException` when there is a streaming
        source present.

        .. note:: Evolving
        """
        return self._jdf.isStreaming()

    @since(2.1)
    def checkpoint(self, eager=True):
        """Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate
        the logical plan of this DataFrame, which is especially useful in iterative algorithms
        where the plan may grow exponentially. It will be saved to files inside the checkpoint
        directory set with :meth:`SparkContext.setCheckpointDir`.

        :param eager: Whether to checkpoint this DataFrame immediately

        .. note:: Experimental
        """
        jdf = self._jdf.checkpoint(eager)
        return DataFrame(jdf, self.sql_ctx)
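    # A minimal usage sketch, assuming an active SparkSession bound to `spark` (with its
    # SparkContext as `sc`); the checkpoint directory below is only illustrative:
    #
    #   sc.setCheckpointDir("/tmp/spark-checkpoints")
    #   df = spark.range(1000)
    #   checkpointed = df.checkpoint()   # eager=True: materializes now and truncates the lineage
    #   checkpointed.count()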

    @since(2.3)
    def localCheckpoint(self, eager=True):
        """Returns a locally checkpointed version of this Dataset. Checkpointing can be used to
        truncate the logical plan of this DataFrame, which is especially useful in iterative
        algorithms where the plan may grow exponentially. Local checkpoints are stored in the
        executors using the caching subsystem and therefore they are not reliable.

        :param eager: Whether to checkpoint this DataFrame immediately

        .. note:: Experimental
        """
        jdf = self._jdf.localCheckpoint(eager)
        return DataFrame(jdf, self.sql_ctx)

    @since(2.1)
    def withWatermark(self, eventTime, delayThreshold):
        """Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point
        in time before which we assume no more late data is going to arrive.

        Spark will use this watermark for several purposes:
          - To know when a given time window aggregation can be finalized and thus can be emitted
            when using output modes that do not allow updates.

          - To minimize the amount of state that we need to keep for on-going aggregations.

        The current watermark is computed by looking at the `MAX(eventTime)` seen across
        all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost
        of coordinating this value across partitions, the actual watermark used is only guaranteed
        to be at least `delayThreshold` behind the actual event time. In some cases we may still
        process records that arrive more than `delayThreshold` late.

        :param eventTime: the name of the column that contains the event time of the row.
        :param delayThreshold: the minimum delay to wait for data to arrive late, relative to the
            latest record that has been processed in the form of an interval
            (e.g. "1 minute" or "5 hours").

        .. note:: Evolving

        >>> sdf.select('name', sdf.time.cast('timestamp')).withWatermark('time', '10 minutes')
        DataFrame[name: string, time: timestamp]
        """
        if not eventTime or type(eventTime) is not str:
            raise TypeError("eventTime should be provided as a string")
        if not delayThreshold or type(delayThreshold) is not str:
            raise TypeError("delayThreshold should be provided as a string interval")
        jdf = self._jdf.withWatermark(eventTime, delayThreshold)
        return DataFrame(jdf, self.sql_ctx)
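    # Sketch of how the watermark is typically combined with a windowed aggregation in
    # Structured Streaming, assuming a SparkSession `spark`; the rate source and the
    # 10-minute threshold are illustrative only:
    #
    #   from pyspark.sql.functions import window
    #   events = spark.readStream.format("rate").load()          # columns: timestamp, value
    #   counts = (events
    #             .withWatermark("timestamp", "10 minutes")
    #             .groupBy(window("timestamp", "5 minutes"))
    #             .count())
    #   query = counts.writeStream.outputMode("append").format("console").start()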

    @since(1.3)
    def count(self):
        """Returns the number of rows in this :class:`DataFrame`.

        >>> df.count()
        2
        """
        return int(self._jdf.count())

    @ignore_unicode_prefix
    @since(1.3)
    def collect(self):
        """Returns all the records as a list of :class:`Row`.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._jdf.collectToPython()
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))

    @ignore_unicode_prefix
    @since(2.0)
    def toLocalIterator(self):
        """
        Returns an iterator that contains all of the rows in this :class:`DataFrame`.
        The iterator will consume as much memory as the largest partition in this DataFrame.

        >>> list(df.toLocalIterator())
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._jdf.toPythonIterator()
        return _load_from_socket(port, BatchedSerializer(PickleSerializer()))

    @ignore_unicode_prefix
    @since(1.3)
    def limit(self, num):
        """Limits the result count to the number specified.

        >>> df.limit(1).collect()
        [Row(age=2, name=u'Alice')]
        >>> df.limit(0).collect()
        []
        """
        jdf = self._jdf.limit(num)
        return DataFrame(jdf, self.sql_ctx)

    @since(1.3)
    def foreach(self, f):
        """Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`.

        This is a shorthand for ``df.rdd.foreach()``.

        >>> def f(person):
        ...     print(person.name)
        >>> df.foreach(f)
        """
        self.rdd.foreach(f)

    @since(1.3)
    def foreachPartition(self, f):
        """Applies the ``f`` function to each partition of this :class:`DataFrame`.

        This is a shorthand for ``df.rdd.foreachPartition()``.

        >>> def f(people):
        ...     for person in people:
        ...         print(person.name)
        >>> df.foreachPartition(f)
        """
        self.rdd.foreachPartition(f)

    @since(1.3)
    def cache(self):
        """Persists the :class:`DataFrame` with the default storage level (``MEMORY_AND_DISK``).

        .. note:: The default storage level has changed to ``MEMORY_AND_DISK`` to match Scala
            in 2.0.
        """
        self.is_cached = True
        self._jdf.cache()
        return self

    @since(1.3)
    def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK):
        """Sets the storage level to persist the contents of the :class:`DataFrame` across
        operations after the first time it is computed. This can only be used to assign
        a new storage level if the :class:`DataFrame` does not have a storage level set yet.
        If no storage level is specified, it defaults to ``MEMORY_AND_DISK``.

        .. note:: The default storage level has changed to ``MEMORY_AND_DISK`` to match Scala
            in 2.0.
        """
        self.is_cached = True
        javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)
        self._jdf.persist(javaStorageLevel)
        return self
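    # Sketch, assuming a SparkSession `spark`: a non-default storage level is passed in via
    # pyspark.StorageLevel, e.g. to keep the data deserialized in memory only:
    #
    #   from pyspark import StorageLevel
    #   df = spark.range(10 ** 6)
    #   df.persist(StorageLevel.MEMORY_ONLY)
    #   df.count()        # the first action materializes the cached data
    #   df.unpersist()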

    @since(1.3)
    def unpersist(self, blocking=False):
        """Marks the :class:`DataFrame` as non-persistent, and removes all blocks for it from
        memory and disk.

        .. note:: `blocking` default has changed to False to match Scala in 2.0.
        """
        self.is_cached = False
        self._jdf.unpersist(blocking)
        return self

    @since(1.4)
    def coalesce(self, numPartitions):
        """
        Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.

        Similar to coalesce defined on an :class:`RDD`, this operation results in a
        narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,
        there will not be a shuffle, instead each of the 100 new partitions will
        claim 10 of the current partitions. If a larger number of partitions is requested,
        it will stay at the current number of partitions.

        However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
        this may result in your computation taking place on fewer nodes than
        you like (e.g. one node in the case of numPartitions = 1). To avoid this,
        you can call repartition(). This will add a shuffle step, but means the
        current upstream partitions will be executed in parallel (per whatever
        the current partitioning is).

        >>> df.coalesce(1).rdd.getNumPartitions()
        1
        """
        return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx)
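    # Sketch of the coalesce/repartition trade-off described above, assuming a SparkSession
    # `spark`; the partition counts are illustrative:
    #
    #   df = spark.range(0, 1000, numPartitions=8)
    #   df.coalesce(2).rdd.getNumPartitions()       # 2, no shuffle
    #   df.coalesce(100).rdd.getNumPartitions()     # stays at 8, coalesce cannot increase
    #   df.repartition(100).rdd.getNumPartitions()  # 100, with a shuffle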

    @since(1.3)
    def distinct(self):
        """Returns a new :class:`DataFrame` containing the distinct rows in this
        :class:`DataFrame`.

        >>> df.distinct().count()
        2
        """
        return DataFrame(self._jdf.distinct(), self.sql_ctx)

    @since(1.5)
    def sampleBy(self, col, fractions, seed=None):
        """
        Returns a stratified sample without replacement based on the
        fraction given on each stratum.

        :param col: column that defines strata
        :param fractions:
            sampling fraction for each stratum. If a stratum is not
            specified, we treat its fraction as zero.
        :param seed: random seed
        :return: a new DataFrame that represents the stratified sample

        >>> from pyspark.sql.functions import col
        >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
        >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
        >>> sampled.groupBy("key").count().orderBy("key").show()
        +---+-----+
        |key|count|
        +---+-----+
        |  0|    5|
        |  1|    9|
        +---+-----+

        """
        if not isinstance(col, basestring):
            raise ValueError("col must be a string, but got %r" % type(col))
        if not isinstance(fractions, dict):
            raise ValueError("fractions must be a dict but got %r" % type(fractions))
        for k, v in fractions.items():
            if not isinstance(k, (float, int, long, basestring)):
                raise ValueError("key must be float, int, long, or string, but got %r" % type(k))
            fractions[k] = float(v)
        seed = seed if seed is not None else random.randint(0, sys.maxsize)
        return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx)

    @since(1.4)
    def randomSplit(self, weights, seed=None):
        """Randomly splits this :class:`DataFrame` with the provided weights.

        :param weights: list of doubles as weights with which to split the DataFrame.
            Weights will be normalized if they don't sum up to 1.0.
        :param seed: The seed for sampling.

        >>> splits = df4.randomSplit([1.0, 2.0], 24)
        >>> splits[0].count()
        1

        >>> splits[1].count()
        3
        """
        for w in weights:
            if w < 0.0:
                raise ValueError("Weights must be positive. Found weight value: %s" % w)
        seed = seed if seed is not None else random.randint(0, sys.maxsize)
        rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), long(seed))
        return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array]
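    # A common use, assuming a SparkSession `spark`: an approximate train/test split with a
    # fixed seed for reproducibility (the 80/20 weights are illustrative):
    #
    #   df = spark.range(1000)
    #   train, test = df.randomSplit([0.8, 0.2], seed=42)
    #   train.count() + test.count() == df.count()   # True; rows are partitioned, not resampled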

    orderBy = sort

    def _jseq(self, cols, converter=None):
        """Return a JVM Seq of Columns from a list of Column or names"""
        return _to_seq(self.sql_ctx._sc, cols, converter)

    def _jmap(self, jm):
        """Return a JVM Scala Map from a dict"""
        return _to_scala_map(self.sql_ctx._sc, jm)

    def _jcols(self, *cols):
        """Return a JVM Seq of Columns from a list of Column or column names

        If `cols` has only one list in it, cols[0] will be used as the list.
        """
        if len(cols) == 1 and isinstance(cols[0], list):
            cols = cols[0]
        return self._jseq(cols, _to_java_column)

    def _sort_cols(self, cols, kwargs):
        """ Return a JVM Seq of Columns that describes the sort order
        """
        if not cols:
            raise ValueError("should sort by at least one column")
        if len(cols) == 1 and isinstance(cols[0], list):
            cols = cols[0]
        jcols = [_to_java_column(c) for c in cols]
        ascending = kwargs.get('ascending', True)
        if isinstance(ascending, (bool, int)):
            if not ascending:
                jcols = [jc.desc() for jc in jcols]
        elif isinstance(ascending, list):
            jcols = [jc if asc else jc.desc()
                     for asc, jc in zip(ascending, jcols)]
        else:
            raise TypeError("ascending can only be boolean or list, but got %s" % type(ascending))
        return self._jseq(jcols)

    @ignore_unicode_prefix
    @since(1.3)
    def head(self, n=None):
        """Returns the first ``n`` rows.

        .. note:: This method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.

        :param n: int, default 1. Number of rows to return.
        :return: If n is greater than 1, return a list of :class:`Row`.
            If n is 1, return a single Row.

        >>> df.head()
        Row(age=2, name=u'Alice')
        >>> df.head(1)
        [Row(age=2, name=u'Alice')]
        """
        if n is None:
            rs = self.head(1)
            return rs[0] if rs else None
        return self.take(n)

    @ignore_unicode_prefix
    @since(1.3)
    def first(self):
        """Returns the first row as a :class:`Row`.

        >>> df.first()
        Row(age=2, name=u'Alice')
        """
        return self.head()

    @ignore_unicode_prefix
    @since(1.3)
    def select(self, *cols):
        """Projects a set of expressions and returns a new :class:`DataFrame`.

        :param cols: list of column names (string) or expressions (:class:`Column`).
            If one of the column names is '*', that column is expanded to include all columns
            in the current DataFrame.

        >>> df.select('*').collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        >>> df.select('name', 'age').collect()
        [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
        >>> df.select(df.name, (df.age + 10).alias('age')).collect()
        [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
        """
        jdf = self._jdf.select(self._jcols(*cols))
        return DataFrame(jdf, self.sql_ctx)

    @since(1.3)
    def selectExpr(self, *expr):
        """Projects a set of SQL expressions and returns a new :class:`DataFrame`.

        This is a variant of :func:`select` that accepts SQL expressions.

        >>> df.selectExpr("age * 2", "abs(age)").collect()
        [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)]
        """
        if len(expr) == 1 and isinstance(expr[0], list):
            expr = expr[0]
        jdf = self._jdf.selectExpr(self._jseq(expr))
        return DataFrame(jdf, self.sql_ctx)

    @ignore_unicode_prefix
    @since(1.3)
    def groupBy(self, *cols):
        """Groups the :class:`DataFrame` using the specified columns,
        so we can run aggregation on them. See :class:`GroupedData`
        for all the available aggregate functions.

        :func:`groupby` is an alias for :func:`groupBy`.

        :param cols: list of columns to group by.
            Each element should be a column name (string) or an expression (:class:`Column`).

        >>> df.groupBy().avg().collect()
        [Row(avg(age)=3.5)]
        >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
        [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
        >>> sorted(df.groupBy(df.name).avg().collect())
        [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
        >>> sorted(df.groupBy(['name', df.age]).count().collect())
        [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)]
        """
        jgd = self._jdf.groupBy(self._jcols(*cols))
        from pyspark.sql.group import GroupedData
        return GroupedData(jgd, self)

    @since(2.0)
    def union(self, other):
        """ Return a new :class:`DataFrame` containing union of rows in this and another frame.

        This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
        (that does deduplication of elements), use this function followed by :func:`distinct`.

        Also as standard in SQL, this function resolves columns by position (not by name).
        """
        return DataFrame(self._jdf.union(other._jdf), self.sql_ctx)

    @since(1.3)
    def unionAll(self, other):
        """ Return a new :class:`DataFrame` containing union of rows in this and another frame.

        This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
        (that does deduplication of elements), use this function followed by :func:`distinct`.

        Also as standard in SQL, this function resolves columns by position (not by name).

        .. note:: Deprecated in 2.0, use :func:`union` instead.
        """
        warnings.warn("Deprecated in 2.0, use union instead.", DeprecationWarning)
        return self.union(other)

    @since(2.3)
    def unionByName(self, other):
        """ Returns a new :class:`DataFrame` containing union of rows in this and another frame.

        This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
        union (that does deduplication of elements), use this function followed by
        :func:`distinct`.

        The difference between this function and :func:`union` is that this function
        resolves columns by name (not by position):

        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
        >>> df1.unionByName(df2).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   6|   4|   5|
        +----+----+----+
        """
        return DataFrame(self._jdf.unionByName(other._jdf), self.sql_ctx)

    @since(1.3)
    def intersect(self, other):
        """ Return a new :class:`DataFrame` containing rows only in
        both this frame and another frame.

        This is equivalent to `INTERSECT` in SQL.
        """
        return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)

    @since(1.3)
    def subtract(self, other):
        """ Return a new :class:`DataFrame` containing rows in this frame
        but not in another frame.

        This is equivalent to `EXCEPT DISTINCT` in SQL.
        """
        return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx)

    @since(1.4)
    def dropDuplicates(self, subset=None):
        """Return a new :class:`DataFrame` with duplicate rows removed,
        optionally only considering certain columns.

        For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
        :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
        duplicate rows. You can use :func:`withWatermark` to limit how late the duplicate data can
        be, and the system will accordingly limit the state. In addition, data older than the
        watermark will be dropped to avoid any possibility of duplicates.

        :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.

        >>> from pyspark.sql import Row
        >>> df = sc.parallelize([ \\
        ...     Row(name='Alice', age=5, height=80), \\
        ...     Row(name='Alice', age=5, height=80), \\
        ...     Row(name='Alice', age=10, height=80)]).toDF()
        >>> df.dropDuplicates().show()
        +---+------+-----+
        |age|height| name|
        +---+------+-----+
        |  5|    80|Alice|
        | 10|    80|Alice|
        +---+------+-----+

        >>> df.dropDuplicates(['name', 'height']).show()
        +---+------+-----+
        |age|height| name|
        +---+------+-----+
        |  5|    80|Alice|
        +---+------+-----+
        """
        if subset is None:
            jdf = self._jdf.dropDuplicates()
        else:
            jdf = self._jdf.dropDuplicates(self._jseq(subset))
        return DataFrame(jdf, self.sql_ctx)

[docs]@since("1.3.1")defdropna(self,how='any',thresh=None,subset=None):"""Returns a new :class:`DataFrame` omitting rows with null values. :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other. :param how: 'any' or 'all'. If 'any', drop a row if it contains any nulls. If 'all', drop a row only if all its values are null. :param thresh: int, default None If specified, drop rows that have less than `thresh` non-null values. This overwrites the `how` parameter. :param subset: optional list of column names to consider. >>> df4.na.drop().show() +---+------+-----+ |age|height| name| +---+------+-----+ | 10| 80|Alice| +---+------+-----+ """ifhowisnotNoneandhownotin['any','all']:raiseValueError("how ('"+how+"') should be 'any' or 'all'")ifsubsetisNone:subset=self.columnselifisinstance(subset,basestring):subset=[subset]elifnotisinstance(subset,(list,tuple)):raiseValueError("subset should be a list or tuple of column names")ifthreshisNone:thresh=len(subset)ifhow=='any'else1returnDataFrame(self._jdf.na().drop(thresh,self._jseq(subset)),self.sql_ctx)

    @since(1.4)
    def replace(self, to_replace, value=_NoValue, subset=None):
        """Returns a new :class:`DataFrame` replacing a value with another value.
        :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are
        aliases of each other.
        Values to_replace and value must have the same type and can only be numerics, booleans,
        or strings. Value can have None. When replacing, the new value will be cast
        to the type of the existing column.
        For numeric replacements all values to be replaced should have unique
        floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)
        an arbitrary replacement will be used.

        :param to_replace: bool, int, long, float, string, list or dict.
            Value to be replaced.
            If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`
            must be a mapping between a value and a replacement.
        :param value: bool, int, long, float, string, list or None.
            The replacement value must be a bool, int, long, float, string or None. If `value` is a
            list, `value` should be of the same length and type as `to_replace`.
            If `value` is a scalar and `to_replace` is a sequence, then `value` is
            used as a replacement for each item in `to_replace`.
        :param subset: optional list of column names to consider.
            Columns specified in subset that do not have matching data type are ignored.
            For example, if `value` is a string, and subset contains a non-string column,
            then the non-string column is simply ignored.

        >>> df4.na.replace(10, 20).show()
        +----+------+-----+
        | age|height| name|
        +----+------+-----+
        |  20|    80|Alice|
        |   5|  null|  Bob|
        |null|  null|  Tom|
        |null|  null| null|
        +----+------+-----+

        >>> df4.na.replace('Alice', None).show()
        +----+------+----+
        | age|height|name|
        +----+------+----+
        |  10|    80|null|
        |   5|  null| Bob|
        |null|  null| Tom|
        |null|  null|null|
        +----+------+----+

        >>> df4.na.replace({'Alice': None}).show()
        +----+------+----+
        | age|height|name|
        +----+------+----+
        |  10|    80|null|
        |   5|  null| Bob|
        |null|  null| Tom|
        |null|  null|null|
        +----+------+----+

        >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
        +----+------+----+
        | age|height|name|
        +----+------+----+
        |  10|    80|   A|
        |   5|  null|   B|
        |null|  null| Tom|
        |null|  null|null|
        +----+------+----+
        """
        if value is _NoValue:
            if isinstance(to_replace, dict):
                value = None
            else:
                raise TypeError("value argument is required when to_replace is not a dictionary.")

        # Helper functions
        def all_of(types):
            """Given a type or tuple of types and a sequence of xs
            check if each x is instance of type(s)

            >>> all_of(bool)([True, False])
            True
            >>> all_of(basestring)(["a", 1])
            False
            """
            def all_of_(xs):
                return all(isinstance(x, types) for x in xs)
            return all_of_

        all_of_bool = all_of(bool)
        all_of_str = all_of(basestring)
        all_of_numeric = all_of((float, int, long))

        # Validate input types
        valid_types = (bool, float, int, long, basestring, list, tuple)
        if not isinstance(to_replace, valid_types + (dict, )):
            raise ValueError(
                "to_replace should be a bool, float, int, long, string, list, tuple, or dict. "
                "Got {0}".format(type(to_replace)))

        if not isinstance(value, valid_types) and value is not None \
                and not isinstance(to_replace, dict):
            raise ValueError("If to_replace is not a dict, value should be "
                             "a bool, float, int, long, string, list, tuple or None. "
                             "Got {0}".format(type(value)))

        if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):
            if len(to_replace) != len(value):
                raise ValueError("to_replace and value lists should be of the same length. "
                                 "Got {0} and {1}".format(len(to_replace), len(value)))

        if not (subset is None or isinstance(subset, (list, tuple, basestring))):
            raise ValueError("subset should be a list or tuple of column names, "
                             "column name or None. Got {0}".format(type(subset)))

        # Reshape input arguments if necessary
        if isinstance(to_replace, (float, int, long, basestring)):
            to_replace = [to_replace]

        if isinstance(to_replace, dict):
            rep_dict = to_replace
            if value is not None:
                warnings.warn("to_replace is a dict and value is not None. value will be ignored.")
        else:
            if isinstance(value, (float, int, long, basestring)) or value is None:
                value = [value for _ in range(len(to_replace))]
            rep_dict = dict(zip(to_replace, value))

        if isinstance(subset, basestring):
            subset = [subset]

        # Verify we were not passed in mixed type generics.
        if not any(all_of_type(rep_dict.keys())
                   and all_of_type(x for x in rep_dict.values() if x is not None)
                   for all_of_type in [all_of_bool, all_of_str, all_of_numeric]):
            raise ValueError("Mixed type replacements are not supported")

        if subset is None:
            return DataFrame(self._jdf.na().replace('*', rep_dict), self.sql_ctx)
        else:
            return DataFrame(
                self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)

    @since(2.0)
    def approxQuantile(self, col, probabilities, relativeError):
        """
        Calculates the approximate quantiles of numerical columns of a
        DataFrame.

        The result of this algorithm has the following deterministic bound:
        If the DataFrame has N elements and if we request the quantile at
        probability `p` up to error `err`, then the algorithm will return
        a sample `x` from the DataFrame so that the *exact* rank of `x` is
        close to (p * N). More precisely,

          floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).

        This method implements a variation of the Greenwald-Khanna
        algorithm (with some speed optimizations). The algorithm was first
        present in [[http://dx.doi.org/10.1145/375663.375670
        Space-efficient Online Computation of Quantile Summaries]]
        by Greenwald and Khanna.

        Note that null values will be ignored in numerical columns before calculation.
        For columns only containing null values, an empty list is returned.

        :param col: str, list.
          Can be a single column name, or a list of names for multiple columns.
        :param probabilities:
          a list of quantile probabilities
          Each number must belong to [0, 1].
          For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
        :param relativeError:
          The relative target precision to achieve (>= 0).
          If set to zero, the exact quantiles are computed, which could be very expensive.
          Note that values greater than 1 are accepted but give the same result as 1.
        :return:  the approximate quantiles at the given probabilities. If
          the input `col` is a string, the output is a list of floats. If the
          input `col` is a list or tuple of strings, the output is also a
          list, but each element in it is a list of floats, i.e., the output
          is a list of list of floats.

        .. versionchanged:: 2.2
           Added support for multiple columns.
        """

        if not isinstance(col, (basestring, list, tuple)):
            raise ValueError("col should be a string, list or tuple, but got %r" % type(col))

        isStr = isinstance(col, basestring)

        if isinstance(col, tuple):
            col = list(col)
        elif isStr:
            col = [col]

        for c in col:
            if not isinstance(c, basestring):
                raise ValueError("columns should be strings, but got %r" % type(c))
        col = _to_list(self._sc, col)

        if not isinstance(probabilities, (list, tuple)):
            raise ValueError("probabilities should be a list or tuple")
        if isinstance(probabilities, tuple):
            probabilities = list(probabilities)
        for p in probabilities:
            if not isinstance(p, (float, int, long)) or p < 0 or p > 1:
                raise ValueError("probabilities should be numerical (float, int, long) in [0,1].")
        probabilities = _to_list(self._sc, probabilities)

        if not isinstance(relativeError, (float, int, long)) or relativeError < 0:
            raise ValueError("relativeError should be numerical (float, int, long) >= 0.")
        relativeError = float(relativeError)

        jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
        jaq_list = [list(j) for j in jaq]
        return jaq_list[0] if isStr else jaq_list
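    # Sketch, assuming a SparkSession `spark`: computing the quartiles and the median of a
    # numeric column, trading accuracy for speed through `relativeError` (values below are
    # illustrative):
    #
    #   df = spark.range(1000).withColumnRenamed("id", "x")
    #   df.approxQuantile("x", [0.25, 0.5, 0.75], 0.01)   # approximate quartiles
    #   df.approxQuantile("x", [0.5], 0.0)                # exact median, more expensive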

    @since(1.4)
    def corr(self, col1, col2, method=None):
        """
        Calculates the correlation of two columns of a DataFrame as a double value.
        Currently only supports the Pearson Correlation Coefficient.
        :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.

        :param col1: The name of the first column
        :param col2: The name of the second column
        :param method: The correlation method. Currently only supports "pearson"
        """
        if not isinstance(col1, basestring):
            raise ValueError("col1 should be a string.")
        if not isinstance(col2, basestring):
            raise ValueError("col2 should be a string.")
        if not method:
            method = "pearson"
        if not method == "pearson":
            raise ValueError("Currently only the calculation of the Pearson Correlation " +
                             "coefficient is supported.")
        return self._jdf.stat().corr(col1, col2, method)

    @since(1.4)
    def cov(self, col1, col2):
        """
        Calculate the sample covariance for the given columns, specified by their names, as a
        double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.

        :param col1: The name of the first column
        :param col2: The name of the second column
        """
        if not isinstance(col1, basestring):
            raise ValueError("col1 should be a string.")
        if not isinstance(col2, basestring):
            raise ValueError("col2 should be a string.")
        return self._jdf.stat().cov(col1, col2)

    @since(1.4)
    def crosstab(self, col1, col2):
        """
        Computes a pair-wise frequency table of the given columns. Also known as a contingency
        table. The number of distinct values for each column should be less than 1e4. At most 1e6
        non-zero pair frequencies will be returned.
        The first column of each row will be the distinct values of `col1` and the column names
        will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
        Pairs that have no occurrences will have zero as their counts.
        :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.

        :param col1: The name of the first column. Distinct items will make the first item of
            each row.
        :param col2: The name of the second column. Distinct items will make the column names
            of the DataFrame.
        """
        if not isinstance(col1, basestring):
            raise ValueError("col1 should be a string.")
        if not isinstance(col2, basestring):
            raise ValueError("col2 should be a string.")
        return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx)

    @since(1.4)
    def freqItems(self, cols, support=None):
        """
        Finding frequent items for columns, possibly with false positives. Using the
        frequent element count algorithm described in
        "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
        :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

        .. note:: This function is meant for exploratory data analysis, as we make no
            guarantee about the backward compatibility of the schema of the resulting DataFrame.

        :param cols: Names of the columns to calculate frequent items for as a list or tuple of
            strings.
        :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
            The support must be greater than 1e-4.
        """
        if isinstance(cols, tuple):
            cols = list(cols)
        if not isinstance(cols, list):
            raise ValueError("cols must be a list or tuple of column names as strings.")
        if not support:
            support = 0.01
        return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support),
                         self.sql_ctx)

    @ignore_unicode_prefix
    @since(1.3)
    def withColumn(self, colName, col):
        """
        Returns a new :class:`DataFrame` by adding a column or replacing the
        existing column that has the same name.

        The column expression must be an expression over this DataFrame; attempting to add
        a column from some other dataframe will raise an error.

        :param colName: string, name of the new column.
        :param col: a :class:`Column` expression for the new column.

        >>> df.withColumn('age2', df.age + 2).collect()
        [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)]
        """
        assert isinstance(col, Column), "col should be Column"
        return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)

    @ignore_unicode_prefix
    @since(1.3)
    def withColumnRenamed(self, existing, new):
        """Returns a new :class:`DataFrame` by renaming an existing column.
        This is a no-op if schema doesn't contain the given column name.

        :param existing: string, name of the existing column to rename.
        :param new: string, new name of the column.

        >>> df.withColumnRenamed('age', 'age2').collect()
        [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')]
        """
        return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx)

    @since(1.4)
    @ignore_unicode_prefix
    def drop(self, *cols):
        """Returns a new :class:`DataFrame` that drops the specified column.
        This is a no-op if schema doesn't contain the given column name(s).

        :param cols: a string name of the column to drop, or a
            :class:`Column` to drop, or a list of string name of the columns to drop.

        >>> df.drop('age').collect()
        [Row(name=u'Alice'), Row(name=u'Bob')]

        >>> df.drop(df.age).collect()
        [Row(name=u'Alice'), Row(name=u'Bob')]

        >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
        [Row(age=5, height=85, name=u'Bob')]

        >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
        [Row(age=5, name=u'Bob', height=85)]

        >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
        [Row(name=u'Bob')]
        """
        if len(cols) == 1:
            col = cols[0]
            if isinstance(col, basestring):
                jdf = self._jdf.drop(col)
            elif isinstance(col, Column):
                jdf = self._jdf.drop(col._jc)
            else:
                raise TypeError("col should be a string or a Column")
        else:
            for col in cols:
                if not isinstance(col, basestring):
                    raise TypeError("each col in the param list should be a string")
            jdf = self._jdf.drop(self._jseq(cols))

        return DataFrame(jdf, self.sql_ctx)

    @since(1.3)
    def toPandas(self):
        """
        Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.

        This is only available if Pandas is installed and available.

        .. note:: This method should only be used if the resulting Pandas's DataFrame is expected
            to be small, as all the data is loaded into the driver's memory.

        >>> df.toPandas()  # doctest: +SKIP
           age   name
        0    2  Alice
        1    5    Bob
        """
        from pyspark.sql.utils import require_minimum_pandas_version
        require_minimum_pandas_version()

        import pandas as pd

        if self.sql_ctx.getConf("spark.sql.execution.pandas.respectSessionTimeZone").lower() \
           == "true":
            timezone = self.sql_ctx.getConf("spark.sql.session.timeZone")
        else:
            timezone = None

        if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
            try:
                from pyspark.sql.types import _check_dataframe_convert_date, \
                    _check_dataframe_localize_timestamps, to_arrow_schema
                from pyspark.sql.utils import require_minimum_pyarrow_version
                require_minimum_pyarrow_version()
                import pyarrow
                to_arrow_schema(self.schema)
                tables = self._collectAsArrow()
                if tables:
                    table = pyarrow.concat_tables(tables)
                    pdf = table.to_pandas()
                    pdf = _check_dataframe_convert_date(pdf, self.schema)
                    return _check_dataframe_localize_timestamps(pdf, timezone)
                else:
                    return pd.DataFrame.from_records([], columns=self.columns)
            except Exception as e:
                msg = (
                    "Note: toPandas attempted Arrow optimization because "
                    "'spark.sql.execution.arrow.enabled' is set to true. Please set it to false "
                    "to disable this.")
                raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
        else:
            pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)

            dtype = {}
            for field in self.schema:
                pandas_type = _to_corrected_pandas_type(field.dataType)
                # SPARK-21766: if an integer field is nullable and has null values, it can be
                # inferred by pandas as float column. Once we convert the column with NaN back
                # to integer type e.g., np.int16, we will hit exception. So we use the inferred
                # float type, not the corrected type from the schema in this case.
                if pandas_type is not None and \
                        not(isinstance(field.dataType, IntegralType) and field.nullable and
                            pdf[field.name].isnull().any()):
                    dtype[field.name] = pandas_type

            for f, t in dtype.items():
                pdf[f] = pdf[f].astype(t, copy=False)

            if timezone is None:
                return pdf
            else:
                from pyspark.sql.types import _check_series_convert_timestamps_local_tz
                for field in self.schema:
                    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                    if isinstance(field.dataType, TimestampType):
                        pdf[field.name] = \
                            _check_series_convert_timestamps_local_tz(pdf[field.name], timezone)
                return pdf
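    # Sketch, assuming a SparkSession `spark` and pyarrow installed: the Arrow code path above
    # is taken only when the corresponding configuration is enabled before calling toPandas():
    #
    #   spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    #   pdf = spark.range(10 ** 6).toPandas()   # converted through Arrow when possible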

    def _collectAsArrow(self):
        """
        Returns all records as list of deserialized ArrowPayloads, pyarrow must be installed
        and available.

        .. note:: Experimental.
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._jdf.collectAsArrowToPython()
        return list(_load_from_socket(port, ArrowSerializer()))

    ##########################################################################################
    # Pandas compatibility
    ##########################################################################################

    groupby = copy_func(
        groupBy,
        sinceversion=1.4,
        doc=":func:`groupby` is an alias for :func:`groupBy`.")

    drop_duplicates = copy_func(
        dropDuplicates,
        sinceversion=1.4,
        doc=":func:`drop_duplicates` is an alias for :func:`dropDuplicates`.")

    where = copy_func(
        filter,
        sinceversion=1.3,
        doc=":func:`where` is an alias for :func:`filter`.")

def _to_scala_map(sc, jm):
    """
    Convert a dict into a JVM Map.
    """
    return sc._jvm.PythonUtils.toScalaMap(jm)


def _to_corrected_pandas_type(dt):
    """
    When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong.
    This method gets the corrected data type for Pandas if that type may be inferred incorrectly.
    """
    import numpy as np
    if type(dt) == ByteType:
        return np.int8
    elif type(dt) == ShortType:
        return np.int16
    elif type(dt) == IntegerType:
        return np.int32
    elif type(dt) == FloatType:
        return np.float32
    else:
        return None

class DataFrameNaFunctions(object):
    """Functionality for working with missing data in :class:`DataFrame`.

    .. versionadded:: 1.4
    """

    def __init__(self, df):
        self.df = df