In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import calendar
from datetime import datetime, timedelta
In [3]:
df = pd.read_csv(r'Data/Jaipur_Rainfall_Data.csv')

#get Hour columns for use in analysis
hour_cols = list(pd.to_numeric(df.columns, errors = 'coerce').dropna().astype(int))
hour_cols = list(map(str, hour_cols))

#convert month string to month number
map_month = dict((v.upper(),k) for k,v in enumerate(calendar.month_abbr)) #mapping for months
df['MONTH'] = df['MONTH'].map(map_month)

#Convert Year Month Date to DATETIME
df['DATETIME'] = pd.to_datetime((df.YEAR*10000+df.MONTH*100+df.DAY).apply(str),format='%Y%m%d')

#convert hour column to unpivot table
df = pd.melt(df, id_vars=['DATETIME'], value_vars=hour_cols)

#Add Hour to time
df['DATETIME'] =  df['DATETIME'] + pd.TimedeltaIndex((df.variable.astype(float) - 1), 'H')
df = df.drop(columns=['variable'])
df.columns = ['DATETIME', 'RAINFALL']
df['RAINFALL'] =  pd.to_numeric(df['RAINFALL'], errors='coerce')

#Recover recoverable data by month mean
df = df.fillna(df.groupby(df['DATETIME'].dt.month).transform('mean'))

#Remove Bad Missing data year
#df = df[df['DATETIME'].dt.year == 2011].reset_index(drop=True) #2011 has missing data for monsoon season
df.index = df['DATETIME']
In [4]:
df.resample('h').sum().plot(kind='line') #Hourly 
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x286be4ae780>
In [5]:
df.resample('d').sum().plot(kind='line') #Daily
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x286bffa60f0>
In [6]:
df.resample('w').sum().plot(kind='line') #Weekly
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x286c0088550>
In [7]:
df.resample('M').sum().plot(kind='line') #Monthly
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x286c005ee10>
In [8]:
df.resample('Y').sum().plot(kind='line') #Yearly
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x286c00da160>
In [22]:
df.resample('D').sum().groupby(df['DATETIME'].dt.weekday_name).mean() #Monthly
Out[22]:
RAINFALL
DATETIME
Friday 1.280612
Monday 1.434431
Saturday 0.928189
Sunday 1.629153
Thursday 1.388936
Tuesday 1.588853
Wednesday 1.882058
In [41]:
df.resample('w').sum().groupby([df['DATETIME'].dt.week, df['DATETIME'].dt.month.apply(lambda x: calendar.month_abbr[x])]).mean() #Weekly
Out[41]:
RAINFALL
DATETIME DATETIME
1.0 Jan 0.000000
2.0 Jan 0.500000
3.0 Jan 0.028571
4.0 Jan 0.000000
5.0 Feb 0.728571
6.0 Feb 3.142857
7.0 Feb 3.671429
8.0 Feb 0.528571
9.0 Mar 1.378187
10.0 Mar 1.563748
11.0 Mar 2.178034
12.0 Mar 0.649462
13.0 Apr 0.230573
Mar 0.000000
14.0 Apr 1.802778
15.0 Apr 2.759921
16.0 Apr 0.902778
17.0 Apr 0.363889
May 1.150000
18.0 May 1.428571
19.0 May 3.157143
20.0 May 4.571429
21.0 May 3.257143
22.0 Jun 7.443204
May 14.600000
23.0 Jun 2.618516
24.0 Jun 4.557143
25.0 Jun 31.642857
26.0 Jul 13.759710
Jun 30.500000
... ... ...
29.0 Jul 21.869119
30.0 Aug 48.300000
Jul 27.026809
31.0 Aug 32.975576
32.0 Aug 36.761290
33.0 Aug 51.089862
34.0 Aug 67.218433
35.0 Aug 27.200000
Sep 40.095657
36.0 Sep 27.741261
37.0 Sep 18.205746
38.0 Sep 18.248603
39.0 Oct 4.081006
Sep 0.000000
40.0 Oct 0.071429
41.0 Oct 0.971429
42.0 Oct 2.014286
43.0 Oct 0.042857
44.0 Nov 0.000000
45.0 Nov 0.528571
46.0 Nov 3.614286
47.0 Nov 7.671429
48.0 Dec 0.000000
Nov 0.000000
49.0 Dec 0.071429
50.0 Dec 0.000000
51.0 Dec 0.000000
52.0 Dec 0.000000
Jan 0.166667
53.0 Jan 2.000000

62 rows × 1 columns

In [31]:
df.resample('M').sum().groupby(df['DATETIME'].dt.month.apply(lambda x: calendar.month_abbr[x])).mean() #Monthly
Out[31]:
RAINFALL
DATETIME
Apr 6.083333
Aug 215.800000
Dec 0.071429
Feb 8.700000
Jan 1.528571
Jul 127.812361
Jun 60.888350
Mar 4.633333
May 14.871429
Nov 11.814286
Oct 3.100000
Sep 73.458101
In [33]:
df.groupby(df['DATETIME'].dt.year).sum() #yearly
Out[33]:
RAINFALL
DATETIME
2006 295.258093
2007 510.748603
2008 470.051189
2009 376.500000
2010 776.100000
2011 555.070462
2012 717.600000
In [15]:
df.resample('Y').sum().mean() #overall Mean Rainfall
Out[15]:
RAINFALL    528.761192
dtype: float64