
Source code for pyspark.rdd

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
import sys
import os
import re
import operator
import shlex
import warnings
import heapq
import bisect
import random
import socket
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
from threading import Thread
from collections import defaultdict
from itertools import chain
from functools import reduce
from math import sqrt, log, isinf, isnan, pow, ceil

if sys.version > '3':
    basestring = unicode = str
else:
    from itertools import imap as map, ifilter as filter

from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \
    BatchedSerializer, CloudPickleSerializer, PairDeserializer, \
    PickleSerializer, pack_long, AutoBatchedSerializer
from pyspark.join import python_join, python_left_outer_join, \
    python_right_outer_join, python_full_outer_join, python_cogroup
from pyspark.statcounter import StatCounter
from pyspark.rddsampler import RDDSampler, RDDRangeSampler, RDDStratifiedSampler
from pyspark.storagelevel import StorageLevel
from pyspark.resultiterable import ResultIterable
from pyspark.shuffle import Aggregator, ExternalMerger, \
    get_used_memory, ExternalSorter, ExternalGroupBy
from pyspark.traceback_utils import SCCallSiteSync

__all__ = ["RDD"]


class PythonEvalType(object):
    """
    Evaluation type of python rdd.

    These values are internal to PySpark.

    These values should match values in org.apache.spark.api.python.PythonEvalType.
    """
    NON_UDF = 0
    SQL_BATCHED_UDF = 100
    SQL_SCALAR_PANDAS_UDF = 200
    SQL_GROUPED_MAP_PANDAS_UDF = 201


def portable_hash(x):
    """
    This function returns consistent hash code for builtin types, especially
    for None and tuple with None.

    The algorithm is similar to that one used by CPython 2.7

    >>> portable_hash(None)
    0
    >>> portable_hash((None, 1)) & 0xffffffff
    219750521
    """

    if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ:
        raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")

    if x is None:
        return 0
    if isinstance(x, tuple):
        h = 0x345678
        for i in x:
            h ^= portable_hash(i)
            h *= 1000003
            h &= sys.maxsize
        h ^= len(x)
        if h == -1:
            h = -2
        return int(h)
    return hash(x)


class BoundedFloat(float):
    """
    Bounded value is generated by approximate job, with confidence and low
    bound and high bound.

    >>> BoundedFloat(100.0, 0.95, 95.0, 105.0)
    100.0
    """
    def __new__(cls, mean, confidence, low, high):
        obj = float.__new__(cls, mean)
        obj.confidence = confidence
        obj.low = low
        obj.high = high
        return obj


def _parse_memory(s):
    """
    Parse a memory string in the format supported by Java (e.g. 1g, 200m) and
    return the value in MB

    >>> _parse_memory("256m")
    256
    >>> _parse_memory("2g")
    2048
    """
    units = {'g': 1024, 'm': 1, 't': 1 << 20, 'k': 1.0 / 1024}
    if s[-1].lower() not in units:
        raise ValueError("invalid format: " + s)
    return int(float(s[:-1]) * units[s[-1].lower()])


def _load_from_socket(port, serializer):
    sock = None
    # Support for both IPv4 and IPv6.
    # On most of IPv6-ready systems, IPv6 will take precedence.
    for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC, socket.SOCK_STREAM):
        af, socktype, proto, canonname, sa = res
        sock = socket.socket(af, socktype, proto)
        try:
            sock.settimeout(15)
            sock.connect(sa)
        except socket.error:
            sock.close()
            sock = None
            continue
        break
    if not sock:
        raise Exception("could not open socket")
    # The RDD materialization time is unpredictable, if we set a timeout for socket reading
    # operation, it will very possibly fail. See SPARK-18281.
    sock.settimeout(None)
    # The socket will be automatically closed when garbage-collected.
    return serializer.load_stream(sock.makefile("rb", 65536))


def ignore_unicode_prefix(f):
    """
    Ignore the 'u' prefix of string in doc tests, to make it work
    in both python 2 and 3
    """
    if sys.version >= '3':
        # the representation of unicode string in Python 3 does not have prefix 'u',
        # so remove the prefix 'u' for doc tests
        literal_re = re.compile(r"(\W|^)[uU](['])", re.UNICODE)
        f.__doc__ = literal_re.sub(r'\1\2', f.__doc__)
    return f


class Partitioner(object):
    def __init__(self, numPartitions, partitionFunc):
        self.numPartitions = numPartitions
        self.partitionFunc = partitionFunc

    def __eq__(self, other):
        return (isinstance(other, Partitioner) and self.numPartitions == other.numPartitions
                and self.partitionFunc == other.partitionFunc)

    def __call__(self, k):
        return self.partitionFunc(k) % self.numPartitions
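
To see how these helpers fit together, here is a small illustrative sketch (not part of the module): a Partitioner built on portable_hash routes a key to a partition by hashing and taking the remainder. It assumes PYTHONHASHSEED is set so portable_hash does not raise on Python 3.

>>> p = Partitioner(4, portable_hash)
>>> p(("a", 1)) == portable_hash(("a", 1)) % 4
True
>>> p == Partitioner(4, portable_hash)
True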

class RDD(object):

    """
    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
    Represents an immutable, partitioned collection of elements that can be
    operated on in parallel.
    """

    def __init__(self, jrdd, ctx, jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())):
        self._jrdd = jrdd
        self.is_cached = False
        self.is_checkpointed = False
        self.ctx = ctx
        self._jrdd_deserializer = jrdd_deserializer
        self._id = jrdd.id()
        self.partitioner = None

    def _pickled(self):
        return self._reserialize(AutoBatchedSerializer(PickleSerializer()))

    def id(self):
        """
        A unique ID for this RDD (within its SparkContext).
        """
        return self._id

    def __repr__(self):
        return self._jrdd.toString()

    def __getnewargs__(self):
        # This method is called when attempting to pickle an RDD, which is always an error:
        raise Exception(
            "It appears that you are attempting to broadcast an RDD or reference an RDD from an "
            "action or transformation. RDD transformations and actions can only be invoked by the "
            "driver, not inside of other transformations; for example, "
            "rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values "
            "transformation and count action cannot be performed inside of the rdd1.map "
            "transformation. For more information, see SPARK-5063."
        )

    @property
    def context(self):
        """
        The L{SparkContext} that this RDD was created on.
        """
        return self.ctx

    def persist(self, storageLevel=StorageLevel.MEMORY_ONLY):
        """
        Set this RDD's storage level to persist its values across operations
        after the first time it is computed. This can only be used to assign
        a new storage level if the RDD does not have a storage level set yet.
        If no storage level is specified defaults to (C{MEMORY_ONLY}).

        >>> rdd = sc.parallelize(["b", "a", "c"])
        >>> rdd.persist().is_cached
        True
        """
        self.is_cached = True
        javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)
        self._jrdd.persist(javaStorageLevel)
        return self

    def unpersist(self):
        """
        Mark the RDD as non-persistent, and remove all blocks for it from
        memory and disk.
        """
        self.is_cached = False
        self._jrdd.unpersist()
        return self

    def checkpoint(self):
        """
        Mark this RDD for checkpointing. It will be saved to a file inside the
        checkpoint directory set with L{SparkContext.setCheckpointDir()} and
        all references to its parent RDDs will be removed. This function must
        be called before any job has been executed on this RDD. It is strongly
        recommended that this RDD is persisted in memory, otherwise saving it
        on a file will require recomputation.
        """
        self.is_checkpointed = True
        self._jrdd.rdd().checkpoint()

    def isCheckpointed(self):
        """
        Return whether this RDD is checkpointed and materialized, either reliably or locally.
        """
        return self._jrdd.rdd().isCheckpointed()

    def localCheckpoint(self):
        """
        Mark this RDD for local checkpointing using Spark's existing caching layer.

        This method is for users who wish to truncate RDD lineages while skipping the
        expensive step of replicating the materialized data in a reliable distributed
        file system. This is useful for RDDs with long lineages that need to be truncated
        periodically (e.g. GraphX).

        Local checkpointing sacrifices fault-tolerance for performance. In particular,
        checkpointed data is written to ephemeral local storage in the executors instead
        of to a reliable, fault-tolerant storage. The effect is that if an executor fails
        during the computation, the checkpointed data may no longer be accessible, causing
        an irrecoverable job failure.

        This is NOT safe to use with dynamic allocation, which removes executors along with
        their cached blocks. If you must use both features, you are advised to set
        L{spark.dynamicAllocation.cachedExecutorIdleTimeout} to a high value.

        The checkpoint directory set through L{SparkContext.setCheckpointDir()} is not used.
        """
        self._jrdd.rdd().localCheckpoint()
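
A minimal usage sketch (assuming an active SparkContext bound to `sc`, as in the doctests elsewhere in this module; the call only marks the RDD, and the data is persisted when a job first computes it):

>>> rdd = sc.parallelize(range(10)).map(lambda x: x * 2)
>>> rdd.localCheckpoint()
>>> rdd.isLocallyCheckpointed()
True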

    def isLocallyCheckpointed(self):
        """
        Return whether this RDD is marked for local checkpointing.

        Exposed for testing.
        """
        return self._jrdd.rdd().isLocallyCheckpointed()

    def getCheckpointFile(self):
        """
        Gets the name of the file to which this RDD was checkpointed.

        Not defined if RDD is checkpointed locally.
        """
        checkpointFile = self._jrdd.rdd().getCheckpointFile()
        if checkpointFile.isDefined():
            return checkpointFile.get()

    def sample(self, withReplacement, fraction, seed=None):
        """
        Return a sampled subset of this RDD.

        :param withReplacement: can elements be sampled multiple times (replaced when sampled out)
        :param fraction: expected size of the sample as a fraction of this RDD's size
            without replacement: probability that each element is chosen; fraction must be [0, 1]
            with replacement: expected number of times each element is chosen; fraction must be >= 0
        :param seed: seed for the random number generator

        .. note:: This is not guaranteed to provide exactly the fraction specified of the total
            count of the given RDD.

        >>> rdd = sc.parallelize(range(100), 4)
        >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14
        True
        """
        assert fraction >= 0.0, "Negative fraction value: %s" % fraction
        return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True)

    def takeSample(self, withReplacement, num, seed=None):
        """
        Return a fixed-size sampled subset of this RDD.

        .. note:: This method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.

        >>> rdd = sc.parallelize(range(0, 10))
        >>> len(rdd.takeSample(True, 20, 1))
        20
        >>> len(rdd.takeSample(False, 5, 2))
        5
        >>> len(rdd.takeSample(False, 15, 3))
        10
        """
        numStDev = 10.0

        if num < 0:
            raise ValueError("Sample size cannot be negative.")
        elif num == 0:
            return []

        initialCount = self.count()
        if initialCount == 0:
            return []

        rand = random.Random(seed)

        if (not withReplacement) and num >= initialCount:
            # shuffle current RDD and return
            samples = self.collect()
            rand.shuffle(samples)
            return samples

        maxSampleSize = sys.maxsize - int(numStDev * sqrt(sys.maxsize))
        if num > maxSampleSize:
            raise ValueError("Sample size cannot be greater than %d." % maxSampleSize)

        fraction = RDD._computeFractionForSampleSize(num, initialCount, withReplacement)
        samples = self.sample(withReplacement, fraction, seed).collect()

        # If the first sample didn't turn out large enough, keep trying to take samples;
        # this shouldn't happen often because we use a big multiplier for their initial size.
        # See: scala/spark/RDD.scala
        while len(samples) < num:
            # TODO: add log warning for when more than one iteration was run
            seed = rand.randint(0, sys.maxsize)
            samples = self.sample(withReplacement, fraction, seed).collect()

        rand.shuffle(samples)

        return samples[0:num]

    @staticmethod
    def _computeFractionForSampleSize(sampleSizeLowerBound, total, withReplacement):
        """
        Returns a sampling rate that guarantees a sample of
        size >= sampleSizeLowerBound 99.99% of the time.

        How the sampling rate is determined:
        Let p = num / total, where num is the sample size and total is the
        total number of data points in the RDD. We're trying to compute
        q > p such that
          - when sampling with replacement, we're drawing each data point
            with prob_i ~ Pois(q), where we want to guarantee
            Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to
            total), i.e. the failure rate of not having a sufficiently large
            sample < 0.0001. Setting q = p + 5 * sqrt(p/total) is sufficient
            to guarantee 0.9999 success rate for num > 12, but we need a
            slightly larger q (9 empirically determined).
          - when sampling without replacement, we're drawing each data point
            with prob_i ~ Binomial(total, fraction) and our choice of q
            guarantees 1-delta, or 0.9999 success rate, where success rate is
            defined the same as in sampling with replacement.
        """
        fraction = float(sampleSizeLowerBound) / total
        if withReplacement:
            numStDev = 5
            if (sampleSizeLowerBound < 12):
                numStDev = 9
            return fraction + numStDev * sqrt(fraction / total)
        else:
            delta = 0.00005
            gamma = - log(delta) / total
            return min(1, fraction + gamma + sqrt(gamma * gamma + 2 * gamma * fraction))
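
As a rough sanity check of the oversampling this helper applies (the exact values are implementation details, so only loose bounds are asserted; this is a sketch, not part of the public API):

>>> f = RDD._computeFractionForSampleSize(100, 1000, withReplacement=False)
>>> 0.1 < f < 0.2
True
>>> f = RDD._computeFractionForSampleSize(100, 1000, withReplacement=True)
>>> 0.1 < f < 0.2
True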

    def cartesian(self, other):
        """
        Return the Cartesian product of this RDD and another one, that is, the
        RDD of all pairs of elements C{(a, b)} where C{a} is in C{self} and
        C{b} is in C{other}.

        >>> rdd = sc.parallelize([1, 2])
        >>> sorted(rdd.cartesian(rdd).collect())
        [(1, 1), (1, 2), (2, 1), (2, 2)]
        """
        # Due to batching, we can't use the Java cartesian method.
        deserializer = CartesianDeserializer(self._jrdd_deserializer,
                                             other._jrdd_deserializer)
        return RDD(self._jrdd.cartesian(other._jrdd), self.ctx, deserializer)

    def collect(self):
        """
        Return a list that contains all of the elements in this RDD.

        .. note:: This method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.
        """
        with SCCallSiteSync(self.context) as css:
            port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
        return list(_load_from_socket(port, self._jrdd_deserializer))

    def fold(self, zeroValue, op):
        """
        Aggregate the elements of each partition, and then the results for all
        the partitions, using a given associative function and a neutral "zero value."

        The function C{op(t1, t2)} is allowed to modify C{t1} and return it
        as its result value to avoid object allocation; however, it should not
        modify C{t2}.

        This behaves somewhat differently from fold operations implemented
        for non-distributed collections in functional languages like Scala.
        This fold operation may be applied to partitions individually, and then
        fold those results into the final result, rather than apply the fold
        to each element sequentially in some defined ordering. For functions
        that are not commutative, the result may differ from that of a fold
        applied to a non-distributed collection.

        >>> from operator import add
        >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)
        15
        """
        def func(iterator):
            acc = zeroValue
            for obj in iterator:
                acc = op(acc, obj)
            yield acc
        # collecting result of mapPartitions here ensures that the copy of
        # zeroValue provided to each partition is unique from the one provided
        # to the final reduce call
        vals = self.mapPartitions(func).collect()
        return reduce(op, vals, zeroValue)
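
Because the zero value is folded in once per partition and once more in the final reduce, a non-identity zero is applied multiple times; a short sketch of that behavior (assuming an active SparkContext `sc`):

>>> from operator import add
>>> sc.parallelize([1, 2, 3, 4], 2).fold(0, add)
10
>>> sc.parallelize([1, 2, 3, 4], 2).fold(5, add)  # 10 + 5 * (2 partitions + 1 final reduce)
25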

    def aggregate(self, zeroValue, seqOp, combOp):
        """
        Aggregate the elements of each partition, and then the results for all
        the partitions, using given combine functions and a neutral "zero value."

        The function C{op(t1, t2)} is allowed to modify C{t1} and return it
        as its result value to avoid object allocation; however, it should not
        modify C{t2}.

        The first function (seqOp) can return a different result type, U, than
        the type of this RDD. Thus, we need one operation for merging a T into
        an U and one operation for merging two U's.

        >>> seqOp = (lambda x, y: (x[0] + y, x[1] + 1))
        >>> combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
        >>> sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)
        (10, 4)
        >>> sc.parallelize([]).aggregate((0, 0), seqOp, combOp)
        (0, 0)
        """
        def func(iterator):
            acc = zeroValue
            for obj in iterator:
                acc = seqOp(acc, obj)
            yield acc
        # collecting result of mapPartitions here ensures that the copy of
        # zeroValue provided to each partition is unique from the one provided
        # to the final reduce call
        vals = self.mapPartitions(func).collect()
        return reduce(combOp, vals, zeroValue)
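
One common use of the (sum, count) accumulator shown above is computing a mean in a single pass; a small sketch (assuming an active SparkContext `sc`):

>>> seqOp = (lambda acc, v: (acc[0] + v, acc[1] + 1))
>>> combOp = (lambda a, b: (a[0] + b[0], a[1] + b[1]))
>>> total, n = sc.parallelize([1.0, 2.0, 3.0, 4.0]).aggregate((0.0, 0), seqOp, combOp)
>>> total / n
2.5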

    def count(self):
        """
        Return the number of elements in this RDD.

        >>> sc.parallelize([2, 3, 4]).count()
        3
        """
        return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()

    def stats(self):
        """
        Return a L{StatCounter} object that captures the mean, variance
        and count of the RDD's elements in one operation.
        """
        def redFunc(left_counter, right_counter):
            return left_counter.mergeStats(right_counter)

        return self.mapPartitions(lambda i: [StatCounter(i)]).reduce(redFunc)

    def histogram(self, buckets):
        """
        Compute a histogram using the provided buckets. The buckets
        are all open to the right except for the last which is closed.
        e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50],
        which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1
        and 50 we would have a histogram of 1,0,1.

        If your histogram is evenly spaced (e.g. [0, 10, 20, 30]),
        this can be switched from an O(log n) insertion to O(1) per
        element (where n is the number of buckets).

        Buckets must be sorted, not contain any duplicates, and have
        at least two elements.

        If `buckets` is a number, it will generate buckets which are
        evenly spaced between the minimum and maximum of the RDD. For
        example, if the min value is 0 and the max is 100, given `buckets`
        as 2, the resulting buckets will be [0,50) [50,100]. `buckets` must
        be at least 1. An exception is raised if the RDD contains infinity.
        If the elements in the RDD do not vary (max == min), a single bucket
        will be used.

        The return value is a tuple of buckets and histogram.

        >>> rdd = sc.parallelize(range(51))
        >>> rdd.histogram(2)
        ([0, 25, 50], [25, 26])
        >>> rdd.histogram([0, 5, 25, 50])
        ([0, 5, 25, 50], [5, 20, 26])
        >>> rdd.histogram([0, 15, 30, 45, 60])  # evenly spaced buckets
        ([0, 15, 30, 45, 60], [15, 15, 15, 6])
        >>> rdd = sc.parallelize(["ab", "ac", "b", "bd", "ef"])
        >>> rdd.histogram(("a", "b", "c"))
        (('a', 'b', 'c'), [2, 2])
        """

        if isinstance(buckets, int):
            if buckets < 1:
                raise ValueError("number of buckets must be >= 1")

            # filter out non-comparable elements
            def comparable(x):
                if x is None:
                    return False
                if type(x) is float and isnan(x):
                    return False
                return True

            filtered = self.filter(comparable)

            # faster than stats()
            def minmax(a, b):
                return min(a[0], b[0]), max(a[1], b[1])
            try:
                minv, maxv = filtered.map(lambda x: (x, x)).reduce(minmax)
            except TypeError as e:
                if " empty " in str(e):
                    raise ValueError("can not generate buckets from empty RDD")
                raise

            if minv == maxv or buckets == 1:
                return [minv, maxv], [filtered.count()]

            try:
                inc = (maxv - minv) / buckets
            except TypeError:
                raise TypeError("Can not generate buckets with non-number in RDD")

            if isinf(inc):
                raise ValueError("Can not generate buckets with infinite value")

            # keep them as integer if possible
            inc = int(inc)
            if inc * buckets != maxv - minv:
                inc = (maxv - minv) * 1.0 / buckets

            buckets = [i * inc + minv for i in range(buckets)]
            buckets.append(maxv)  # fix accumulated error
            even = True

        elif isinstance(buckets, (list, tuple)):
            if len(buckets) < 2:
                raise ValueError("buckets should have more than one value")

            if any(i is None or isinstance(i, float) and isnan(i) for i in buckets):
                raise ValueError("can not have None or NaN in buckets")

            if sorted(buckets) != list(buckets):
                raise ValueError("buckets should be sorted")

            if len(set(buckets)) != len(buckets):
                raise ValueError("buckets should not contain duplicated values")

            minv = buckets[0]
            maxv = buckets[-1]
            even = False
            inc = None
            try:
                steps = [buckets[i + 1] - buckets[i] for i in range(len(buckets) - 1)]
            except TypeError:
                pass  # objects in buckets do not support '-'
            else:
                if max(steps) - min(steps) < 1e-10:  # handle precision errors
                    even = True
                    inc = (maxv - minv) / (len(buckets) - 1)

        else:
            raise TypeError("buckets should be a list or tuple or number(int or long)")

        def histogram(iterator):
            counters = [0] * len(buckets)
            for i in iterator:
                if i is None or (type(i) is float and isnan(i)) or i > maxv or i < minv:
                    continue
                t = (int((i - minv) / inc) if even
                     else bisect.bisect_right(buckets, i) - 1)
                counters[t] += 1
            # add last two together
            last = counters.pop()
            counters[-1] += last
            return [counters]

        def mergeCounters(a, b):
            return [i + j for i, j in zip(a, b)]

        return buckets, self.mapPartitions(histogram).reduce(mergeCounters)

    def top(self, num, key=None):
        """
        Get the top N elements from an RDD.

        .. note:: This method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.

        .. note:: It returns the list sorted in descending order.

        >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
        [12]
        >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
        [6, 5]
        >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
        [4, 3, 2]
        """
        def topIterator(iterator):
            yield heapq.nlargest(num, iterator, key=key)

        def merge(a, b):
            return heapq.nlargest(num, a + b, key=key)

        return self.mapPartitions(topIterator).reduce(merge)

    def takeOrdered(self, num, key=None):
        """
        Get the N elements from an RDD ordered in ascending order or as
        specified by the optional key function.

        .. note:: this method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.

        >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
        [1, 2, 3, 4, 5, 6]
        >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)
        [10, 9, 7, 6, 5, 4]
        """

        def merge(a, b):
            return heapq.nsmallest(num, a + b, key)

        return self.mapPartitions(lambda it: [heapq.nsmallest(num, it, key)]).reduce(merge)

    def take(self, num):
        """
        Take the first num elements of the RDD.

        It works by first scanning one partition, and using the results from
        that partition to estimate the number of additional partitions needed
        to satisfy the limit.

        Translated from the Scala implementation in RDD#take().

        .. note:: this method should only be used if the resulting array is expected
            to be small, as all the data is loaded into the driver's memory.

        >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
        [2, 3]
        >>> sc.parallelize([2, 3, 4, 5, 6]).take(10)
        [2, 3, 4, 5, 6]
        >>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3)
        [91, 92, 93]
        """
        items = []
        totalParts = self.getNumPartitions()
        partsScanned = 0

        while len(items) < num and partsScanned < totalParts:
            # The number of partitions to try in this iteration.
            # It is ok for this number to be greater than totalParts because
            # we actually cap it at totalParts in runJob.
            numPartsToTry = 1
            if partsScanned > 0:
                # If we didn't find any rows after the previous iteration,
                # quadruple and retry.  Otherwise, interpolate the number of
                # partitions we need to try, but overestimate it by 50%.
                # We also cap the estimation in the end.
                if len(items) == 0:
                    numPartsToTry = partsScanned * 4
                else:
                    # the first parameter of max is >=1 whenever partsScanned >= 2
                    numPartsToTry = int(1.5 * num * partsScanned / len(items)) - partsScanned
                    numPartsToTry = min(max(numPartsToTry, 1), partsScanned * 4)

            left = num - len(items)

            def takeUpToNumLeft(iterator):
                iterator = iter(iterator)
                taken = 0
                while taken < left:
                    yield next(iterator)
                    taken += 1

            p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
            res = self.context.runJob(self, takeUpToNumLeft, p)

            items += res
            partsScanned += numPartsToTry

        return items[:num]

    def isEmpty(self):
        """
        Returns true if and only if the RDD contains no elements at all.

        .. note:: an RDD may be empty even when it has at least 1 partition.

        >>> sc.parallelize([]).isEmpty()
        True
        >>> sc.parallelize([1]).isEmpty()
        False
        """
        return self.getNumPartitions() == 0 or len(self.take(1)) == 0

    def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
        """
        Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file
        system, using the new Hadoop OutputFormat API (mapreduce package). Keys/values are
        converted for output using either user specified converters or, by default,
        L{org.apache.spark.api.python.JavaToWritableConverter}.

        :param conf: Hadoop job configuration, passed in as a dict
        :param keyConverter: (None by default)
        :param valueConverter: (None by default)
        """
        jconf = self.ctx._dictToJavaMap(conf)
        pickledRDD = self._pickled()
        self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickledRDD._jrdd, True, jconf,
                                                    keyConverter, valueConverter, True)

    def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
                               keyConverter=None, valueConverter=None, conf=None):
        """
        Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file
        system, using the new Hadoop OutputFormat API (mapreduce package). Key and value types
        will be inferred if not specified. Keys and values are converted for output using either
        user specified converters or L{org.apache.spark.api.python.JavaToWritableConverter}. The
        C{conf} is applied on top of the base Hadoop conf associated with the SparkContext
        of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.

        :param path: path to Hadoop file
        :param outputFormatClass: fully qualified classname of Hadoop OutputFormat
               (e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
        :param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.IntWritable", None by default)
        :param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.Text", None by default)
        :param keyConverter: (None by default)
        :param valueConverter: (None by default)
        :param conf: Hadoop job configuration, passed in as a dict (None by default)
        """
        jconf = self.ctx._dictToJavaMap(conf)
        pickledRDD = self._pickled()
        self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile(pickledRDD._jrdd, True, path,
                                                       outputFormatClass,
                                                       keyClass, valueClass,
                                                       keyConverter, valueConverter, jconf)
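
A hedged usage sketch; the output path and the choice of SequenceFileOutputFormat below are illustrative assumptions taken from the docstring, not a prescription:

>>> rdd = sc.parallelize([(1, "a"), (2, "b")])
>>> rdd.saveAsNewAPIHadoopFile(  # doctest: +SKIP
...     "/tmp/example-output",   # illustrative path
...     "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
...     keyClass="org.apache.hadoop.io.IntWritable",
...     valueClass="org.apache.hadoop.io.Text")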

    def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
        """
        Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file
        system, using the old Hadoop OutputFormat API (mapred package). Keys/values are
        converted for output using either user specified converters or, by default,
        L{org.apache.spark.api.python.JavaToWritableConverter}.

        :param conf: Hadoop job configuration, passed in as a dict
        :param keyConverter: (None by default)
        :param valueConverter: (None by default)
        """
        jconf = self.ctx._dictToJavaMap(conf)
        pickledRDD = self._pickled()
        self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickledRDD._jrdd, True, jconf,
                                                    keyConverter, valueConverter, False)

    def saveAsSequenceFile(self, path, compressionCodecClass=None):
        """
        Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file
        system, using the L{org.apache.hadoop.io.Writable} types that we convert from the
        RDD's key and value types. The mechanism is as follows:

            1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects.
            2. Keys and values of this Java RDD are converted to Writables and written out.

        :param path: path to sequence file
        :param compressionCodecClass: (None by default)
        """
        pickledRDD = self._pickled()
        self.ctx._jvm.PythonRDD.saveAsSequenceFile(pickledRDD._jrdd, True,
                                                   path, compressionCodecClass)
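
A hedged usage sketch (the output path is an illustrative assumption):

>>> rdd = sc.parallelize([("key1", 1.0), ("key2", 2.0)])
>>> rdd.saveAsSequenceFile("/tmp/example-seqfile")  # doctest: +SKIP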

    def collectAsMap(self):
        """
        Return the key-value pairs in this RDD to the master as a dictionary.

        .. note:: this method should only be used if the resulting data is expected
            to be small, as all the data is loaded into the driver's memory.

        >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
        >>> m[1]
        2
        >>> m[3]
        4
        """
        return dict(self.collect())

    def reduceByKey(self, func, numPartitions=None, partitionFunc=portable_hash):
        """
        Merge the values for each key using an associative and commutative reduce function.

        This will also perform the merging locally on each mapper before
        sending results to a reducer, similarly to a "combiner" in MapReduce.

        Output will be partitioned with C{numPartitions} partitions, or
        the default parallelism level if C{numPartitions} is not specified.
        Default partitioner is hash-partition.

        >>> from operator import add
        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> sorted(rdd.reduceByKey(add).collect())
        [('a', 2), ('b', 1)]
        """
        return self.combineByKey(lambda x: x, func, func, numPartitions, partitionFunc)

    def reduceByKeyLocally(self, func):
        """
        Merge the values for each key using an associative and commutative reduce function, but
        return the results immediately to the master as a dictionary.

        This will also perform the merging locally on each mapper before
        sending results to a reducer, similarly to a "combiner" in MapReduce.

        >>> from operator import add
        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> sorted(rdd.reduceByKeyLocally(add).items())
        [('a', 2), ('b', 1)]
        """
        def reducePartition(iterator):
            m = {}
            for k, v in iterator:
                m[k] = func(m[k], v) if k in m else v
            yield m

        def mergeMaps(m1, m2):
            for k, v in m2.items():
                m1[k] = func(m1[k], v) if k in m1 else v
            return m1
        return self.mapPartitions(reducePartition).reduce(mergeMaps)

    def countByKey(self):
        """
        Count the number of elements for each key, and return the result to the
        master as a dictionary.

        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> sorted(rdd.countByKey().items())
        [('a', 2), ('b', 1)]
        """
        return self.map(lambda x: x[0]).countByValue()

    def join(self, other, numPartitions=None):
        """
        Return an RDD containing all pairs of elements with matching keys in
        C{self} and C{other}.

        Each pair of elements will be returned as a (k, (v1, v2)) tuple, where
        (k, v1) is in C{self} and (k, v2) is in C{other}.

        Performs a hash join across the cluster.

        >>> x = sc.parallelize([("a", 1), ("b", 4)])
        >>> y = sc.parallelize([("a", 2), ("a", 3)])
        >>> sorted(x.join(y).collect())
        [('a', (1, 2)), ('a', (1, 3))]
        """
        return python_join(self, other, numPartitions)

    def leftOuterJoin(self, other, numPartitions=None):
        """
        Perform a left outer join of C{self} and C{other}.

        For each element (k, v) in C{self}, the resulting RDD will either
        contain all pairs (k, (v, w)) for w in C{other}, or the pair
        (k, (v, None)) if no elements in C{other} have key k.

        Hash-partitions the resulting RDD into the given number of partitions.

        >>> x = sc.parallelize([("a", 1), ("b", 4)])
        >>> y = sc.parallelize([("a", 2)])
        >>> sorted(x.leftOuterJoin(y).collect())
        [('a', (1, 2)), ('b', (4, None))]
        """
        return python_left_outer_join(self, other, numPartitions)

    def rightOuterJoin(self, other, numPartitions=None):
        """
        Perform a right outer join of C{self} and C{other}.

        For each element (k, w) in C{other}, the resulting RDD will either
        contain all pairs (k, (v, w)) for v in this, or the pair (k, (None, w))
        if no elements in C{self} have key k.

        Hash-partitions the resulting RDD into the given number of partitions.

        >>> x = sc.parallelize([("a", 1), ("b", 4)])
        >>> y = sc.parallelize([("a", 2)])
        >>> sorted(y.rightOuterJoin(x).collect())
        [('a', (2, 1)), ('b', (None, 4))]
        """
        return python_right_outer_join(self, other, numPartitions)

    def fullOuterJoin(self, other, numPartitions=None):
        """
        Perform a full outer join of C{self} and C{other}.

        For each element (k, v) in C{self}, the resulting RDD will either
        contain all pairs (k, (v, w)) for w in C{other}, or the pair
        (k, (v, None)) if no elements in C{other} have key k.

        Similarly, for each element (k, w) in C{other}, the resulting RDD will
        either contain all pairs (k, (v, w)) for v in C{self}, or the pair
        (k, (None, w)) if no elements in C{self} have key k.

        Hash-partitions the resulting RDD into the given number of partitions.

        >>> x = sc.parallelize([("a", 1), ("b", 4)])
        >>> y = sc.parallelize([("a", 2), ("c", 8)])
        >>> sorted(x.fullOuterJoin(y).collect())
        [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
        """
        return python_full_outer_join(self, other, numPartitions)

    # TODO: add option to control map-side combining
    # portable_hash is used as default, because builtin hash of None is different
    # cross machines.

    def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
                     numPartitions=None, partitionFunc=portable_hash):
        """
        Generic function to combine the elements for each key using a custom
        set of aggregation functions.

        Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined
        type" C.

        Users provide three functions:

            - C{createCombiner}, which turns a V into a C (e.g., creates
              a one-element list)
            - C{mergeValue}, to merge a V into a C (e.g., adds it to the end of
              a list)
            - C{mergeCombiners}, to combine two C's into a single one (e.g., merges
              the lists)

        To avoid memory allocation, both mergeValue and mergeCombiners are allowed to
        modify and return their first argument instead of creating a new C.

        In addition, users can control the partitioning of the output RDD.

        .. note:: V and C can be different -- for example, one might group an RDD of type
            (Int, Int) into an RDD of type (Int, List[Int]).

        >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 2)])
        >>> def to_list(a):
        ...     return [a]
        ...
        >>> def append(a, b):
        ...     a.append(b)
        ...     return a
        ...
        >>> def extend(a, b):
        ...     a.extend(b)
        ...     return a
        ...
        >>> sorted(x.combineByKey(to_list, append, extend).collect())
        [('a', [1, 2]), ('b', [1])]
        """
        if numPartitions is None:
            numPartitions = self._defaultReducePartitions()

        serializer = self.ctx.serializer
        memory = self._memory_limit()
        agg = Aggregator(createCombiner, mergeValue, mergeCombiners)

        def combineLocally(iterator):
            merger = ExternalMerger(agg, memory * 0.9, serializer)
            merger.mergeValues(iterator)
            return merger.items()

        locally_combined = self.mapPartitions(combineLocally, preservesPartitioning=True)
        shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)

        def _mergeCombiners(iterator):
            merger = ExternalMerger(agg, memory, serializer)
            merger.mergeCombiners(iterator)
            return merger.items()

        return shuffled.mapPartitions(_mergeCombiners, preservesPartitioning=True)

    def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None,
                       partitionFunc=portable_hash):
        """
        Aggregate the values of each key, using given combine functions and a neutral
        "zero value". This function can return a different result type, U, than the type
        of the values in this RDD, V. Thus, we need one operation for merging a V into
        a U and one operation for merging two U's. The former operation is used for merging
        values within a partition, and the latter is used for merging values between
        partitions. To avoid memory allocation, both of these functions are
        allowed to modify and return their first argument instead of creating a new U.
        """
        def createZero():
            return copy.deepcopy(zeroValue)

        return self.combineByKey(
            lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions, partitionFunc)
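
The docstring above has no example, so here is a hedged sketch that keeps a per-key (sum, count) pair (assuming an active SparkContext `sc`):

>>> rdd = sc.parallelize([("a", 1), ("a", 3), ("b", 5)])
>>> seqFunc = (lambda acc, v: (acc[0] + v, acc[1] + 1))
>>> combFunc = (lambda a, b: (a[0] + b[0], a[1] + b[1]))
>>> sorted(rdd.aggregateByKey((0, 0), seqFunc, combFunc).collect())
[('a', (4, 2)), ('b', (5, 1))]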

    def foldByKey(self, zeroValue, func, numPartitions=None, partitionFunc=portable_hash):
        """
        Merge the values for each key using an associative function "func"
        and a neutral "zeroValue" which may be added to the result an
        arbitrary number of times, and must not change the result
        (e.g., 0 for addition, or 1 for multiplication.).

        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> from operator import add
        >>> sorted(rdd.foldByKey(0, add).collect())
        [('a', 2), ('b', 1)]
        """
        def createZero():
            return copy.deepcopy(zeroValue)

        return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions,
                                 partitionFunc)

    def groupByKey(self, numPartitions=None, partitionFunc=portable_hash):
        """
        Group the values for each key in the RDD into a single sequence.
        Hash-partitions the resulting RDD with numPartitions partitions.

        .. note:: If you are grouping in order to perform an aggregation (such as a
            sum or average) over each key, using reduceByKey or aggregateByKey will
            provide much better performance.

        >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> sorted(rdd.groupByKey().mapValues(len).collect())
        [('a', 2), ('b', 1)]
        >>> sorted(rdd.groupByKey().mapValues(list).collect())
        [('a', [1, 1]), ('b', [1])]
        """
        def createCombiner(x):
            return [x]

        def mergeValue(xs, x):
            xs.append(x)
            return xs

        def mergeCombiners(a, b):
            a.extend(b)
            return a

        memory = self._memory_limit()
        serializer = self._jrdd_deserializer
        agg = Aggregator(createCombiner, mergeValue, mergeCombiners)

        def combine(iterator):
            merger = ExternalMerger(agg, memory * 0.9, serializer)
            merger.mergeValues(iterator)
            return merger.items()

        locally_combined = self.mapPartitions(combine, preservesPartitioning=True)
        shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)

        def groupByKey(it):
            merger = ExternalGroupBy(agg, memory, serializer)
            merger.mergeCombiners(it)
            return merger.items()

        return shuffled.mapPartitions(groupByKey, True).mapValues(ResultIterable)

    def repartition(self, numPartitions):
        """
        Return a new RDD that has exactly numPartitions partitions.

        Can increase or decrease the level of parallelism in this RDD.
        Internally, this uses a shuffle to redistribute data.
        If you are decreasing the number of partitions in this RDD, consider
        using `coalesce`, which can avoid performing a shuffle.

        >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
        >>> sorted(rdd.glom().collect())
        [[1], [2, 3], [4, 5], [6, 7]]
        >>> len(rdd.repartition(2).glom().collect())
        2
        >>> len(rdd.repartition(10).glom().collect())
        10
        """
        return self.coalesce(numPartitions, shuffle=True)

    def zip(self, other):
        """
        Zips this RDD with another one, returning key-value pairs with the
        first element in each RDD, second element in each RDD, etc. Assumes
        that the two RDDs have the same number of partitions and the same
        number of elements in each partition (e.g. one was made through
        a map on the other).

        >>> x = sc.parallelize(range(0,5))
        >>> y = sc.parallelize(range(1000, 1005))
        >>> x.zip(y).collect()
        [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]
        """
        def get_batch_size(ser):
            if isinstance(ser, BatchedSerializer):
                return ser.batchSize
            return 1  # not batched

        def batch_as(rdd, batchSize):
            return rdd._reserialize(BatchedSerializer(PickleSerializer(), batchSize))

        my_batch = get_batch_size(self._jrdd_deserializer)
        other_batch = get_batch_size(other._jrdd_deserializer)
        if my_batch != other_batch or not my_batch:
            # use the smallest batchSize for both of them
            batchSize = min(my_batch, other_batch)
            if batchSize <= 0:
                # auto batched or unlimited
                batchSize = 100
            other = batch_as(other, batchSize)
            self = batch_as(self, batchSize)

        if self.getNumPartitions() != other.getNumPartitions():
            raise ValueError("Can only zip with RDD which has the same number of partitions")

        # There will be an Exception in JVM if there are different number
        # of items in each partitions.
        pairRDD = self._jrdd.zip(other._jrdd)
        deserializer = PairDeserializer(self._jrdd_deserializer,
                                        other._jrdd_deserializer)
        return RDD(pairRDD, self.ctx, deserializer)

    def zipWithIndex(self):
        """
        Zips this RDD with its element indices.

        The ordering is first based on the partition index and then the
        ordering of items within each partition. So the first item in
        the first partition gets index 0, and the last item in the last
        partition receives the largest index.

        This method needs to trigger a spark job when this RDD contains
        more than one partition.

        >>> sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()
        [('a', 0), ('b', 1), ('c', 2), ('d', 3)]
        """
        starts = [0]
        if self.getNumPartitions() > 1:
            nums = self.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
            for i in range(len(nums) - 1):
                starts.append(starts[-1] + nums[i])

        def func(k, it):
            for i, v in enumerate(it, starts[k]):
                yield v, i

        return self.mapPartitionsWithIndex(func)

    def zipWithUniqueId(self):
        """
        Zips this RDD with generated unique Long ids.

        Items in the kth partition will get ids k, n+k, 2*n+k, ..., where
        n is the number of partitions. So there may exist gaps, but this
        method won't trigger a spark job, which is different from
        L{zipWithIndex}.

        >>> sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()
        [('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]
        """
        n = self.getNumPartitions()

        def func(k, it):
            for i, v in enumerate(it):
                yield v, i * n + k

        return self.mapPartitionsWithIndex(func)

    def name(self):
        """
        Return the name of this RDD.
        """
        n = self._jrdd.name()
        if n:
            return n
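
A short sketch of the default behavior: when no name has been assigned (a name can be set with setName, defined later in this module), None is returned.

>>> sc.parallelize([1, 2]).name() is None
True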

    def _defaultReducePartitions(self):
        """
        Returns the default number of partitions to use during reduce tasks (e.g., groupBy).
        If spark.default.parallelism is set, then we'll use the value from SparkContext
        defaultParallelism, otherwise we'll use the number of partitions in this RDD.

        This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce
        the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will
        be inherent.
        """
        if self.ctx._conf.contains("spark.default.parallelism"):
            return self.ctx.defaultParallelism
        else:
            return self.getNumPartitions()

    def _to_java_object_rdd(self):
        """
        Return a JavaRDD of Object by unpickling.

        It will convert each Python object into Java object by Pyrolite, whenever
        the RDD is serialized in batch or not.
        """
        rdd = self._pickled()
        return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True)

    def countApprox(self, timeout, confidence=0.95):
        """
        .. note:: Experimental

        Approximate version of count() that returns a potentially incomplete
        result within a timeout, even if not all tasks have finished.

        >>> rdd = sc.parallelize(range(1000), 10)
        >>> rdd.countApprox(1000, 1.0)
        1000
        """
        drdd = self.mapPartitions(lambda it: [float(sum(1 for i in it))])
        return int(drdd.sumApprox(timeout, confidence))

    def countApproxDistinct(self, relativeSD=0.05):
        """
        .. note:: Experimental

        Return approximate number of distinct elements in the RDD.

        The algorithm used is based on streamlib's implementation of
        `"HyperLogLog in Practice: Algorithmic Engineering of a State
        of The Art Cardinality Estimation Algorithm", available here
        <http://dx.doi.org/10.1145/2452376.2452456>`_.

        :param relativeSD: Relative accuracy. Smaller values create
                           counters that require more space.
                           It must be greater than 0.000017.

        >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
        >>> 900 < n < 1100
        True
        >>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()
        >>> 16 < n < 24
        True
        """
        if relativeSD < 0.000017:
            raise ValueError("relativeSD should be greater than 0.000017")
        # the hash space in Java is 2^32
        hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)
        return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)

    def toLocalIterator(self):
        """
        Return an iterator that contains all of the elements in this RDD.
        The iterator will consume as much memory as the largest partition in this RDD.

        >>> rdd = sc.parallelize(range(10))
        >>> [x for x in rdd.toLocalIterator()]
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        """
        with SCCallSiteSync(self.context) as css:
            port = self.ctx._jvm.PythonRDD.toLocalIteratorAndServe(self._jrdd.rdd())
        return _load_from_socket(port, self._jrdd_deserializer)