Analyse de sentiments de courriers électroniques à l’aide de Python et de Microsoft Azure – Partie 1
# Pull the "Body" column out into its own object.
# NOTE(review): assumes email_data is a pandas DataFrame loaded earlier in
# the file - confirm against the loading code.
email_body = email_data['Body']

# Display the top 5 rows and the overall length of the series.
print(email_body.head())
print('\n')  # blank spacer; the escape sequence (not the letter "n") is intended
print("Starting email count:", email_body.shape)
# Remove carriage-return and line-feed characters from every e-mail body.
# The patterns must be the escape sequences "\r" and "\n": matching the plain
# letters "r" and "n" would delete those letters from the message text itself.
# regex=False makes the match literal regardless of the pandas version's
# default for the `regex` parameter.
email_body = email_body.str.replace("\r", "", regex=False)
email_body = email_body.str.replace("\n", "", regex=False)

# Display the top 5 rows and the overall length of the series.
print(email_body.head())
print('\n')  # blank spacer between the preview and the count
print("Current e-mail count:", email_body.shape)
# Cut off the quoted e-mail thread that trails the start of the signature.
# str.partition splits each string at the first "Regards" into three columns:
# 0 = text before the separator, 1 = the separator, 2 = text after it.
split_df = email_body.str.partition("Regards")

# Preview the first three rows and report the resulting shape.
print(split_df[0:3])
print('\n')  # blank spacer between the preview and the count
print("Current e-mail count:", split_df.shape)
# Remove the extra columns produced by partitioning:
# column 1 contains "Regards", column 2 contains the trailing text.
clean_col = split_df.drop(columns=[1, 2])

# Remove rows with NaN (no data).
clean_nan = clean_col.dropna()

print("E-mail count before NaN removal:", clean_col.shape[0])  # count before NaN removal
print("E-mail count after NaN removal:", clean_nan.shape[0])  # count after NaN removal
# Give the remaining column (label 0) the name "EmailBody".
clean_nan = clean_nan.rename(columns={0: "EmailBody"})

# Remove e-mails containing the default out-of-office reply. There is no space
# after the comma in the pattern; the text is matched as a literal substring
# (regex=False).
clean_pto = clean_nan[
    ~clean_nan["EmailBody"].str.contains("Hello,I am currently", regex=False)
]

# Remove e-mails containing a forwarded-message banner (literal match).
cleaned_df = clean_pto[
    ~clean_pto["EmailBody"].str.contains(
        "---------- Forwarded message ---------", regex=False
    )
]

print("E-mail count before removals:", clean_nan.shape[0])  # pre-PTO count
print("E-mail count after removing PTO messages:", clean_pto.shape[0])  # post-PTO count
print("E-mail count after also removing forwarded messages:", cleaned_df.shape[0])  # post-forward removal
# Some rows still hold no data: mark bodies that are exactly a single space
# as NaN so they can be dropped below.
# Work on an explicit copy and assign the column back. Calling
# .replace(..., inplace=True) on a column selected from a filtered frame is a
# chained in-place update: it triggers SettingWithCopyWarning and has no
# effect on cleaned_df under pandas copy-on-write.
cleaned_df = cleaned_df.copy()
cleaned_df['EmailBody'] = cleaned_df['EmailBody'].replace(" ", np.nan)
# Rows whose body was a single space now show NaN.
print(cleaned_df)

# Find all rows with NaN and drop them.
cleaned_df = cleaned_df.dropna()
print(cleaned_df)
print('\n')  # blank spacer between the frame and the count
print("E-mail count after dropping empty rows/rows with NaN:", cleaned_df.shape)
# Build a list holding one inner list of column values per DataFrame row.
# This replaces a manual "empty list + append per row" loop with a single
# conversion that yields the same list-of-lists shape.
senti_list = cleaned_df.values.tolist()

# The length of the list matches the number of rows in the DataFrame.
print("E-mail count before error removal, ready for analysis:", len(senti_list))