Google Trends Crawler
Google Trends provides a time-series search index for a search keyword. The search index shows how often the keyword is searched. It is important to note that the search index is not the exact search volume: Google masks the real search volume and provides a rescaled index bounded between 0 and 100. Even though it is not the real search volume, it still carries meaningful information, and it is one of the best predictor variables for market demand, word-of-mouth, and so on.
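Google does not disclose how the rescaling works, but conceptually the index behaves like a raw series normalized by its own peak. Here is a minimal sketch of that idea, purely for illustration; the real computation is Google's and is not public:

# Purely illustrative: Google's actual rescaling is not public.
def rescale(raw_counts):
    peak = float(max(raw_counts))
    return [int(round(100 * c / peak)) for c in raw_counts]

print rescale([120, 480, 240])  # prints [25, 100, 50]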
The following two figures compare the pattern of real iPhone sales with the Google search index. The Google search index explains real iPhone sales remarkably well, because the majority of consumers search for "iPhone" before they buy one.
You can retrieve the search index by time and region. Moreover, you can compare the search indexes of multiple keywords, which lets us predict demand for multiple durable goods at the same time (see the usage sketch at the end of this post).
However, manually extracting the search index for a large number of keywords would be tedious, so I developed an automated Google Trends crawler in Python. One word of caution: if you continuously and frequently send a lot of queries to Google's servers, Google might ban your IP.
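A simple precaution is to pause for a random interval between queries. Here is a minimal sketch you could call between crawler requests; the delay bounds are my own guesses, not limits that Google publishes:

import random
import time

def polite_sleep(min_sec=20, max_sec=60):
    # Sleep a random number of seconds so the request pattern looks less
    # mechanical; tune the bounds to your own tolerance for risk.
    time.sleep(random.uniform(min_sec, max_sec))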
Here is the key component of the Python Google Trends crawler. Enjoy!
import json
import random
import re
import smtplib
import urllib
import urllib2
import xlrd

from cookielib import CookieJar
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from openpyxl import Workbook


def is_number(s):
    # Helper used when parsing date strings (not shown in the original
    # snippet; a plausible definition): True if s parses as a number.
    try:
        float(s)
        return True
    except ValueError:
        return False


class pyGoogleTrendsCrawler(object):
    def __init__(self, username, password):
        # Rotate among several user-agent strings so queries look like
        # ordinary browser traffic.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
            "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        ]
        self.login_params = {
            "continue": 'http://www.google.com/trends',
            "PersistentCookie": "yes",
            "Email": username,
            "Passwd": password,
        }
        self.input_file = "input.xls"
        self.output_file = "google_trends.xlsx"
        self.sheet_name = "google_trends"
        self.headers = [
            ("Referrer", "https://www.google.com/accounts/ServiceLoginBoxAuth"),
            ("Content-type", "application/x-www-form-urlencoded"),
            ('User-Agent', random.choice(self.user_agent_list)),
            ("Accept", "text/plain"),
        ]
        self.url_ServiceLoginBoxAuth = 'https://accounts.google.com/ServiceLoginBoxAuth'
        self.url_Export = 'http://www.google.com/trends/viz'
        self.url_CookieCheck = 'https://www.google.com/accounts/CheckCookie?chtml=LoginDoneHtml'
        self.url_PrefCookie = 'http://www.google.com'
        self._connect()

    def _connect(self):
        # Log in to a Google account so that Trends queries are authenticated.
        self.cj = CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        self.opener.addheaders = self.headers
        # Google embeds a one-time GALX token in the login form; extract it.
        # (Alternative attribute order kept from an earlier version of the page:)
        # galx = re.compile('<input type="hidden" name="GALX" value="(?P<galx>[a-zA-Z0-9_-]+)">')
        galx = re.compile('<input name="GALX" type="hidden" value="(?P<galx>[a-zA-Z0-9_-]+)">')
        resp = self.opener.open(self.url_ServiceLoginBoxAuth).read()
        resp = re.sub(r'\s\s+', ' ', resp)
        m = galx.search(resp)
        if not m:
            raise Exception("Cannot parse GALX out of login page")
        self.login_params['GALX'] = m.group('galx')
        params = urllib.urlencode(self.login_params)
        self.opener.open(self.url_ServiceLoginBoxAuth, params)
        self.opener.open(self.url_CookieCheck)
        self.opener.open(self.url_PrefCookie)
    def _send_email(self, text):
        # Mail an error report through Gmail's SMTP server.
        # Fill in the recipient lists before using this.
        to_addresses = []
        cc_addresses = []
        msg = MIMEMultipart()
        msg['From'] = self.login_params["Email"]
        msg['To'] = ', '.join(to_addresses)
        msg['Cc'] = ', '.join(cc_addresses)
        msg['Subject'] = u"google trends crawler error report"
        msg.attach(MIMEText(text, 'plain', _charset="utf-8"))
        mailServer = smtplib.SMTP("smtp.gmail.com", 587)
        mailServer.ehlo()
        mailServer.starttls()
        mailServer.ehlo()
        mailServer.login(self.login_params["Email"], self.login_params["Passwd"])
        mailServer.sendmail(self.login_params["Email"], to_addresses + cc_addresses, msg.as_string())
        mailServer.close()
    def query(self, keywords, date='all', geo='all'):
        # Build the fetchComponent request; "all" means "omit the parameter".
        params = {'q': ",".join(keywords), 'date': date, 'geo': geo}
        for key in [k for k, v in params.iteritems() if v == "all"]:
            del params[key]
        params = urllib.urlencode(params)
        url = "http://www.google.com/trends/fetchComponent?" + params + "&cid=TIMESERIES_GRAPH_0&export=3"
        print url
        try:
            html = ""
            json_string = ""
            response = self.opener.open(url)
            html = response.read()
            # The response is JavaScript, not pure JSON: neutralize the
            # new Date(...) constructors, then extract the JSON payload.
            html = re.sub(r'new Date\([0-9\,]*\)', "0", html)
            m = re.search(r"google.visualization.Query.setResponse\( (.*?) \)", html)
            json_string = m.group(1)
            # Drop rows in which every keyword value is null (this pattern
            # reconstructs the intent of the original, which was not a valid
            # regular expression as written).
            json_string = re.sub(r',\{"c":\[\{"v":0,"f":"[0-9a-zA-Z ]*"\}(,\{"v":null\}){1,5}\]\}', "", json_string)
            dictionary = json.loads(json_string)
            if dictionary["status"] != "ok":
                # Trends returned no table: fall back to a stub JSON that only
                # carries the (quoted) keywords, so downstream code can still
                # label the output rows.
                for i in xrange(len(keywords)):
                    keywords[i] = "\"" + re.sub(r'\"', "", keywords[i]) + "\""
                json_string = "{ \"keywords\" : [" + ",".join(keywords) + "]}"
            return json_string
        except Exception as e:
            error_report = {'error_code': str(type(e)),
                            'url': url,
                            'html': html,
                            'json_string': json_string}
            self._send_email(str(error_report))
            print error_report
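    # For reference, the URL assembled by query() for two keywords restricted
    # to the US looks like this (parameter order may vary):
    #   http://www.google.com/trends/fetchComponent?q=iPhone%2CGalaxy+S&geo=US&cid=TIMESERIES_GRAPH_0&export=3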
    def json_decode_data(self, jsondata):
        # Turn the JSON payload into one list per keyword:
        # [label, v_1, v_2, ...]. If the query returned no table, emit
        # label-only rows so the spreadsheet still gets a column header.
        google_trends_data_row = []
        dictionary = json.loads(jsondata)
        if "table" in dictionary:
            rows = dictionary['table']['rows']
            num_keywords = len(dictionary['table']['cols']) - 1
            for i in xrange(num_keywords):
                google_trends_data_row.append([dictionary['table']['cols'][i + 1]['label']])
            for row in rows:
                rowc = row['c']
                for i in xrange(num_keywords):
                    google_trends_data_row[i].append(rowc[i + 1]['v'])
            print google_trends_data_row
        else:
            num_keywords = len(dictionary['keywords'])
            for i in xrange(num_keywords):
                google_trends_data_row.append([dictionary['keywords'][i]])
        return google_trends_data_row
    def json_decode_date(self, jsondata):
        # Build the header row: fixed metadata columns followed by one
        # SI_YYYYMM column per observation date.
        dates_row = ["Search_pair", "KeywordFinal", "type", "index", "index2", "Brand", "Product_name"]
        dictionary = json.loads(jsondata)
        rows = dictionary['table']['rows']
        for row in rows:
            rowc = row['c']
            date = rowc[0]['f']
            if not is_number(date[0]):
                # Date formatted as a month name, e.g. "January 2010".
                date_obj = datetime.strptime(date, '%B %Y')
                dates_row.append(date_obj.strftime("SI_%Y%m"))
            else:
                # Numeric date, e.g. "2010-1": zero-pad the month.
                p = re.compile('\d+')
                date_array = p.findall(date)
                if int(date_array[1]) < 10:
                    date_array[1] = "0" + str(date_array[1])
                dates_row.append("SI_" + date_array[0] + date_array[1])
        print dates_row
        return dates_row

    def insert_data_to_xlsx(self, google_trends_data):
        for data in google_trends_data:
            self.ws.append(data)
    def insert_dates_to_xlsx(self, dates):
        self.ws.append(dates)

    def _load_keywords(self):
        # Read keywords from the input spreadsheet. Column layout, as read
        # below: 0 = keyword, 1 = type ("series" or "product"), 2 = product
        # index, 3 = series index, 4 = brand, 5 = product name.
        wb = xlrd.open_workbook(self.input_file)
        ws = wb.sheet_by_name(wb.sheet_names()[0])
        highest_row = ws.nrows
        series_data = {}
        product_data = {}
        for i in xrange(1, highest_row):
            type = ws.cell_value(i, 1)
            data = ws.cell_value(i, 0)
            brand = ws.cell_value(i, 4)
            product_name = ws.cell_value(i, 5)
            if type == "series":
                index = ws.cell_value(i, 3)
                series_data[index] = [data, brand, product_name]
            elif type == "product":
                index = ws.cell_value(i, 2)
                product_data[index] = [data, brand, product_name]
        return [series_data, product_data]

    def make_xlsx(self):
        self.wb = Workbook()
        self.ws = self.wb.create_sheet(0)
        self.ws.title = self.sheet_name

    def save_xlsx(self):
        self.wb.save(self.output_file)

    def save_temp_xlsx(self, count):
        # Save an intermediate copy, e.g. google_trends3.xlsx after batch 3.
        filenameset = self.output_file.split(".")
        self.wb.save(filenameset[0] + str(count) + "." + filenameset[1])
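To put the pieces together, a usage sketch might look like this. The credentials and keywords are placeholders, it assumes the query succeeds, and Gmail logins of this kind may require relaxed security settings on the account:

crawler = pyGoogleTrendsCrawler("your.account@gmail.com", "your_password")
crawler.make_xlsx()

# Compare two keywords in the US over the whole available period.
json_string = crawler.query(["iPhone", "Galaxy S"], geo="US")
crawler.insert_dates_to_xlsx(crawler.json_decode_date(json_string))
crawler.insert_data_to_xlsx(crawler.json_decode_data(json_string))

crawler.save_xlsx()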