importpandasaspd# load the datadata=pd.read_csv("csv_name.csv")# get first lines of dataprint(data.head(5))# get useful info regarding datasedprint(data.describe())print(train_set.info())# change print options of pandaspd.options.display.<tab># fill not existing variabels in column with mediandata['Age']=data["Age"].fillna(data["Age"].median())# select all 'male' records from the "Sex" columndata.loc[data["Sex"]=="male","Sex"]# replace all values with 0data.loc[data["Sex"]=="male","Sex"]=0data.loc[bool_mask,"column_to_fetch"]=value# get unique values from columndata["Embarked"].unique()# select only required rowsdata[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]]# count values in data frame# get how many "True" entries exist in data frametmp_res.value_counts()[True]# another way to count data from listtitles=titanic["Name"].apply(get_title)print(pandas.value_counts(titles))# generate new column from previous onetitanic["FamilySize"]=titanic["SibSp"]+titanic["Parch"]# generate new column with .apply method# The .apply method generates a new seriestitanic["NameLength"]=titanic["Name"].apply(lambdax:len(x))# filter dataframe based on seriesdf=df[df.filename.isin(df_clear.filename)]# but sometimes more useful to apply method "by hands"# we have nearly the same perfomance, but more verbose output# previously you should install "tqdm"result_array=[]foriintqdm(xrange(0,len(data["column_name"]))):new_data=some_function(data["column_name"][i])result_array.append(new_data)# skip rows during data loadingtest_set=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',skiprows=1,header=None)# add names for columscolumn_labels=['first','second','title']test_set.columns=column_labels# remove all rows contains '?' question marktrain_set.replace(' ?',np.nan).dropna()# replace one value by another in columntest_nomissing['wage_class']=test_nomissing.wage_class.replace({' <=50K.':' <=50K',' >50K.':' >50K'})# concatenate two datasets verticallycombined_set=pd.concat([train_nomissing,test_nomissing],axis=0)# replace every sting value to the integer# Loop through all columns in the dataframeforfeatureincombined_set.columns:# Only apply for columns with categorical stringsifcombined_set[feature].dtype=='object':# Replace strings with an integercombined_set[feature]=pd.Categorical(combined_set[feature]).codes# create pandas data frame from data and save it# assume that data - nested list of lists, when each nested list - values for one linecolumns=['word_1','word_2','target_similarity','calculated_similarity']df=pd.DataFrame(data=data,columns=columns)df=pd.DataFrame(data=csv_lines,columns=columns)df.to_csv(save_file_name,index=False,header=True)# create new dataframe if not exists or use existing one# csv_columns = list of names for columns# csv_data = dict with keys same as columns namesifos.path.exists(csv_f_name):main_csv=pd.read_csv(csv_f_name,index_col=0)else:main_csv=pd.DataFrame(columns=csv_columns)main_csv=main_csv.append(csv_data,ignore_index=True)main_csv.to_csv(csv_f_name,index=True,header=True,na_rep='NaN')# convert sting list to python listfromastimportliteral_evalliteral_eval('[1.23, 2.34]')[1.23,2.34]