%%timeuser_ids=[]subreddit_ids=[]subreddit_to_id={}i=0withopen("reddit_user_posting_behavior.csv",'r')asf:forlineinf:forsrinline.rstrip().split(",")[1:]:ifsrnotinsubreddit_to_id:subreddit_to_id[sr]=len(subreddit_to_id)user_ids.append(i)subreddit_ids.append(subreddit_to_id[sr])i+=1importnumpyasnpfromscipy.sparseimportcsr_matrixrows=np.array(subreddit_ids)cols=np.array(user_ids)data=np.ones((len(user_ids),))num_rows=len(subreddit_to_id)num_cols=i# the code above exists to feed this calladj=csr_matrix((data,(rows,cols)),shape=(num_rows,num_cols))printadj.shapeprint""# now we have our matrix, so let's gather up a bit of info about itusers_per_subreddit=adj.sum(axis=1).A1subreddits=range(len(subreddit_to_id))forsrinsubreddit_to_id:subreddits[subreddit_to_id[sr]]=srsubreddits=np.array(subreddits)

# Show the axes on which a particular subreddit scores the highest/lowest.
def pickOutSubreddit(sr):
    """Rank every subreddit along each embedding axis, ordered for ``sr``.

    Assumes ``embedded_coords`` (defined elsewhere) is a 2-D array whose rows
    line up with the entries of ``subreddits`` -- confirm against the cell
    that creates it.

    Args:
        sr: Subreddit name to look up in ``subreddits``.

    Returns:
        A ``pandas.DataFrame`` of subreddit names.  Columns are labelled with
        axis indices, ordered from the axis where ``sr`` has its largest
        coordinate to the one where it has its smallest.  Within a column the
        names run from the highest value on that axis to the lowest.

    Raises:
        ValueError: If ``sr`` is not present in ``subreddits``.
    """
    target_row = list(subreddits).index(sr)

    # Axis indices, from sr's largest coordinate down to its smallest.
    axes_by_score = embedded_coords[target_row].argsort()[::-1]

    # Reorder the columns to match, then rank all subreddits within each
    # column (argsort is ascending, so flip to get highest first).
    reordered_coords = embedded_coords[:, axes_by_score]
    ranking = np.argsort(reordered_coords, axis=0)[::-1]

    return pd.DataFrame(subreddits[ranking], columns=axes_by_score)


pickOutSubreddit("soccer")