In this article, we explore generating analytics for a product-based company from its web logs. Even with a small dataset, we can still gain many valuable insights.

Problem Statement: Generate Analytics based on the data in Hadoop Eco-system:
- Load weblog data into HDFS using HDFS client
- Develop Pig program to load log and perform analytics on IP Category-1 Category-2 page, status_code
2.1. Count of page views by individual user ie [IP, count(*)]
2.2. Top / Bottom 2: category-1 / category-2 / page / users (exclude status codes other than 200)
Top 2 and bottom 2 records
- Category, total_number_views
- page, total_number_views
- IP, total_number_of_views
2.3. Total page views / Category wise pageviews / Unique pageviews
- page,total_number_of_views
- category, total_views
- page, total_number_of_unique_views
2.4. Count of status code = 200 / 404 / 400 / 500
- status_code, count
- Load results into tables in MySql Database using Sqoop.

Attribute Information or Dataset Details:
Data: Input Format – Text

Technology Used
- Apache Hadoop (HDFS)
- Apache Pig
- Apache Hive
- MySQL
- Shell Script
- Apache Sqoop
- Linux
Apache Pig Code (weblog.pig)
/* weblog.pig — Pig program to extract the web-log KPIs.
 * Input : CSV lines of IP,Category1,Category2,page,statuscode.
 * Output: one HDFS directory per KPI, stored as comma-separated text.
 */

FILE1 = LOAD '/home/bhavesh/weblog.txt' USING PigStorage(',')
        AS (IP:chararray, Category1:chararray, Category2:chararray, page:int, statuscode:int);

/* KPI 2.1 — Count of page views by individual user, i.e. (IP, count(*)) */
GRPD_IP_ALL = GROUP FILE1 BY IP;
IPCOUNT = FOREACH GRPD_IP_ALL GENERATE group, COUNT(FILE1.IP);
STORE IPCOUNT INTO '/home/bhavesh/Count_by_Pageview_by_IndividualUser/' USING PigStorage(',');

/* KPI 2.2 — Top 2 / Bottom 2 of Category1, Category2, page and IP,
 * counting only successful requests (exclude status codes other than 200). */
FILE_STATUS_CODE_200 = FILTER FILE1 BY statuscode == 200;

GRPD_CATEGORY1 = GROUP FILE_STATUS_CODE_200 BY Category1;
CNT_FOR_CATEGORY1 = FOREACH GRPD_CATEGORY1 GENERATE group, COUNT(FILE_STATUS_CODE_200.Category1) AS COUNTING;

SORTED_CATEGORY1_DEC = ORDER CNT_FOR_CATEGORY1 BY COUNTING DESC;
TOP_2_CATEGORY1 = LIMIT SORTED_CATEGORY1_DEC 2;
STORE TOP_2_CATEGORY1 INTO '/home/bhavesh/Top2_By_Category1/' USING PigStorage(',');

SORTED_CATEGORY1_ASC = ORDER CNT_FOR_CATEGORY1 BY COUNTING ASC;
BOTTOM_2_CATEGORY1 = LIMIT SORTED_CATEGORY1_ASC 2;
STORE BOTTOM_2_CATEGORY1 INTO '/home/bhavesh/Bottom2_By_Category1/' USING PigStorage(',');

GRPD_CATEGORY2 = GROUP FILE_STATUS_CODE_200 BY Category2;
CNT_FOR_CATEGORY2 = FOREACH GRPD_CATEGORY2 GENERATE group, COUNT(FILE_STATUS_CODE_200.Category2) AS COUNTING;

SORTED_CATEGORY2_DEC = ORDER CNT_FOR_CATEGORY2 BY COUNTING DESC;
TOP_2_CATEGORY2 = LIMIT SORTED_CATEGORY2_DEC 2;
STORE TOP_2_CATEGORY2 INTO '/home/bhavesh/Top2_By_Category2/' USING PigStorage(',');

SORTED_CATEGORY2_ASC = ORDER CNT_FOR_CATEGORY2 BY COUNTING ASC;
BOTTOM_2_CATEGORY2 = LIMIT SORTED_CATEGORY2_ASC 2;
STORE BOTTOM_2_CATEGORY2 INTO '/home/bhavesh/Bottom2_By_Category2/' USING PigStorage(',');

GRPD_PAGES = GROUP FILE_STATUS_CODE_200 BY page;
CNT_FOR_PAGE = FOREACH GRPD_PAGES GENERATE group, COUNT(FILE_STATUS_CODE_200.page) AS COUNTING;

SORTED_PAGE_DEC = ORDER CNT_FOR_PAGE BY COUNTING DESC;
TOP_2_PAGE = LIMIT SORTED_PAGE_DEC 2;
STORE TOP_2_PAGE INTO '/home/bhavesh/Top2_By_PAGE/' USING PigStorage(',');

SORTED_PAGE_ASC = ORDER CNT_FOR_PAGE BY COUNTING ASC;
BOTTOM_2_PAGE = LIMIT SORTED_PAGE_ASC 2;
STORE BOTTOM_2_PAGE INTO '/home/bhavesh/Bottom2_By_PAGE/' USING PigStorage(',');

GRPD_IP = GROUP FILE_STATUS_CODE_200 BY IP;
CNT_FOR_IP = FOREACH GRPD_IP GENERATE group, COUNT(FILE_STATUS_CODE_200.IP) AS COUNTING;

SORTED_IP_DEC = ORDER CNT_FOR_IP BY COUNTING DESC;
TOP_2_IP = LIMIT SORTED_IP_DEC 2;
STORE TOP_2_IP INTO '/home/bhavesh/Top2_By_IP/' USING PigStorage(',');

SORTED_IP_ASC = ORDER CNT_FOR_IP BY COUNTING ASC;
BOTTOM_2_IP = LIMIT SORTED_IP_ASC 2;
STORE BOTTOM_2_IP INTO '/home/bhavesh/Bottom2_By_IP/' USING PigStorage(',');

/* KPI 2.3 — Total page views / category-wise page views / unique page views.
 * (page, total_number_of_views), (category, total_views),
 * (page, total_number_of_unique_views) */

GRPD_TOTALPAGES = GROUP FILE1 BY page;
CNT_FOR_TOTALPAGE = FOREACH GRPD_TOTALPAGES GENERATE group, COUNT(FILE1.page) AS COUNTING;
SORTED_PAGES_DEC = ORDER CNT_FOR_TOTALPAGE BY COUNTING DESC;
STORE SORTED_PAGES_DEC INTO '/home/bhavesh/Total_PAGES_Count/' USING PigStorage(',');

GRPD_TOTALCATEGORY1 = GROUP FILE1 BY Category1;
CNT_FOR_TOTALCATEGORY1 = FOREACH GRPD_TOTALCATEGORY1 GENERATE group, COUNT(FILE1.page) AS COUNTING;
SORTED_TOTALCATEGORY1_DEC = ORDER CNT_FOR_TOTALCATEGORY1 BY COUNTING DESC;
STORE SORTED_TOTALCATEGORY1_DEC INTO '/home/bhavesh/Total_Category1_Count/' USING PigStorage(',');

GRPD_TOTALCATEGORY2 = GROUP FILE1 BY Category2;
CNT_FOR_TOTALCATEGORY2 = FOREACH GRPD_TOTALCATEGORY2 GENERATE group, COUNT(FILE1.page) AS COUNTING;
SORTED_TOTALCATEGORY2_DEC = ORDER CNT_FOR_TOTALCATEGORY2 BY COUNTING DESC;
STORE SORTED_TOTALCATEGORY2_DEC INTO '/home/bhavesh/Total_Category2_Count/' USING PigStorage(',');

/* Unique page views: for each page, count the number of DISTINCT client IPs
 * using a nested FOREACH block. */
GRPD_TOTALPAGES_UNIQUEVIEW = GROUP FILE1 BY page;
CNT_FOR_TOTALPAGE_UNIQUEVIEW = FOREACH GRPD_TOTALPAGES_UNIQUEVIEW {
    internet_protocol = FILE1.IP;
    unique_internet_protocol = DISTINCT internet_protocol;
    GENERATE group, COUNT(unique_internet_protocol);
};
STORE CNT_FOR_TOTALPAGE_UNIQUEVIEW INTO '/home/bhavesh/Page_Total_Number_Of_Unique_Views/' USING PigStorage(',');

/* KPI 2.4 — Count per status code (200 / 404 / 400 / 500).
 * BUGFIX: original re-bound the alias GRPD (already used for the IP grouping
 * above); reusing aliases is error-prone and defeats Pig's multi-query
 * optimization, so a unique alias is used here. */
GRPD_STATUS = GROUP FILE1 BY statuscode;
STATUS_CODE_COUNT = FOREACH GRPD_STATUS GENERATE group, COUNT(FILE1.statuscode);
STORE STATUS_CODE_COUNT INTO '/home/bhavesh/Status_Code_Count/' USING PigStorage(',');
Shell Script (weblog.sh)
#!/bin/bash
#####################################################################
### PROGRAM : weblog.sh
### AUTHOR  : BHAVESH BHADRICHA
### DATE    : 11/DEC/2015
### VERSION : 1.1
### PURPOSE : End-to-end web-log analytics pipeline.
###   Input file format: IP,Category-1,Category-2,page,status_code
###   1. Load the raw web log into HDFS.
###   2. Run the Pig script (weblog.pig) to compute the KPIs:
###        - page views per individual user (IP, count(*))
###        - Top 2 / Bottom 2 of category-1 / category-2 / page / IP
###          (excluding status codes other than 200)
###        - total / category-wise / unique page views
###        - count per status code (200 / 404 / 400 / 500)
###      (NOTE: header previously said "Top/Bottom 5" — the pipeline
###       actually computes Top/Bottom 2, matching weblog.pig.)
###   3. Expose the Pig output through Hive external tables.
###   4. Export the Hive tables into MySQL via Sqoop.
#####################################################################

#####################################################################
### Local variables ###
#####################################################################
DATE=$(date +"%Y%m%d_%H%M%S")
LOGDIR="/home/bhavesh/POC/WEBLOG_POC/LOG"
LOGFILE="${LOGDIR}/${DATE}.log"
# BUGFIX: create the log directory up front; the first ">> $LOGFILE"
# would otherwise fail on a fresh machine.
mkdir -p "$LOGDIR"

#####################################################################
### Step 1: load data into HDFS using the HDFS client ###
#####################################################################
hadoop fs -put weblog.txt /home/bhavesh/weblog.txt

#####################################################################
### Step 2: Pig processing ###
#####################################################################
echo "Pig Script starts here"
echo "PIG Script,Weblog Processing" >> "$LOGFILE"

# Pig's STORE fails if the target directory already exists, so clear all
# output directories first. "-rm -r -f" replaces the deprecated "-rmr"
# and stays quiet when a directory does not exist yet (first run).
PIG_OUTPUT_DIRS="Total_Category1_Count Bottom2_By_PAGE Top2_By_Category1 \
Top2_By_IP Total_PAGES_Count Bottom2_By_IP Bottom2_By_Category2 \
Status_Code_Count Count_by_Pageview_by_IndividualUser \
Page_Total_Number_Of_Unique_Views Total_Category2_Count \
Bottom2_By_Category1 Top2_By_Category2 Top2_By_PAGE"

for dir in $PIG_OUTPUT_DIRS; do
    hadoop fs -rm -r -f "/home/bhavesh/${dir}"
done

pig /home/bhavesh/POC/WEBLOG_POC/weblog.pig

if [ $? -eq 0 ]; then
    echo "Succesfully finished PIG Processing " >> "$LOGFILE"
else
    echo "PIG Processing Failed Please check the Log " >> "$LOGFILE"
    # BUGFIX: the original script logged the failure but then carried on,
    # loading stale or missing data into Hive and exporting it to MySQL.
    exit 1
fi

#####################################################################
### Step 3: Hive processing ###
### Recreate one external table per KPI and move the Pig output into it.
#####################################################################
echo "HIVE Script starts here"
echo "HIVE LOAD data into Table " >> "$LOGFILE"

# "table|first column definition" — every table also has "countings int".
HIVE_TABLES="\
TotalCategory1Count|Category1 string
Bottom2ByPAGE|Pages int
Top2ByCategory1|Category1 string
Top2ByIP|IP string
TotalPAGESCount|PAGES int
Bottom2ByIP|IP string
Bottom2ByCategory2|Category2 string
StatusCodeCount|StatusCode int
CountbyPageviewbyIndividualUser|IP string
PageTotalNumberOfUniqueViews|page int
TotalCategory2Count|Category2 string
Bottom2ByCategory1|Category1 string
Top2ByCategory2|Category2 string
Top2ByPAGE|page int"

echo "$HIVE_TABLES" | while IFS='|' read -r table firstcol; do
    hive -e "drop table if exists ${table}"
    hive -e "create external table ${table}
             (${firstcol},
              countings int)
             row format delimited
             fields terminated by','
             lines terminated by '\n'
             stored as textfile location '/home/bhavesh/hive/${table}'"
done

# "Pig output directory|Hive table" — LOAD DATA INPATH moves the part
# file from the Pig output directory into the table's HDFS location.
LOAD_MAP="\
Total_Category1_Count|TotalCategory1Count
Bottom2_By_PAGE|Bottom2ByPAGE
Top2_By_Category1|Top2ByCategory1
Top2_By_IP|Top2ByIP
Total_PAGES_Count|TotalPAGESCount
Bottom2_By_IP|Bottom2ByIP
Bottom2_By_Category2|Bottom2ByCategory2
Status_Code_Count|StatusCodeCount
Count_by_Pageview_by_IndividualUser|CountbyPageviewbyIndividualUser
Page_Total_Number_Of_Unique_Views|PageTotalNumberOfUniqueViews
Total_Category2_Count|TotalCategory2Count
Bottom2_By_Category1|Bottom2ByCategory1
Top2_By_Category2|Top2ByCategory2
Top2_By_PAGE|Top2ByPAGE"

echo "$LOAD_MAP" | while IFS='|' read -r dir table; do
    hive -e "load data inpath '/home/bhavesh/${dir}/part-r-00000' overwrite into table ${table}"
done

#####################################################################
### Step 4: Sqoop processing ###
### Push each Hive table's data file into the matching MySQL table.
### NOTE(review): --password on the command line is visible in `ps`
### and shell history; prefer --password-file in production.
#####################################################################
for table in TotalCategory1Count Bottom2ByPAGE Top2ByCategory1 Top2ByIP \
             TotalPAGESCount Bottom2ByIP Bottom2ByCategory2 StatusCodeCount \
             CountbyPageviewbyIndividualUser PageTotalNumberOfUniqueViews \
             TotalCategory2Count Bottom2ByCategory1 Top2ByCategory2 Top2ByPAGE; do
    sqoop export --connect jdbc:mysql://localhost/mysql \
          --username root --password root \
          --table "$table" \
          --export-dir "/home/bhavesh/hive/${table}/part-r-00000" \
          --input-fields-terminated-by ','
done