Google Trends Python API

2016. 4. 25. 20:55 · IT Technology / Python, Ruby, and Other Languages

Source: https://github.com/GeneralMills/pytrends/blob/master/pytrends/pyGTrends.py


from __future__ import absolute_import, print_function, unicode_literals

import copy
import csv
from datetime import datetime
from io import open
import re
import sys
import requests
import json
from fake_useragent import UserAgent
if sys.version_info[0] == 2:  # Python 2
    from cookielib import CookieJar
    from cStringIO import StringIO
    from urllib import urlencode
    from urllib import quote
    from urllib2 import build_opener, HTTPCookieProcessor
else:  # Python 3
    from http.cookiejar import CookieJar
    from io import StringIO
    from urllib.parse import urlencode
    from urllib.parse import quote
    from urllib.request import build_opener, HTTPCookieProcessor


class pyGTrends(object):
    """
    Google Trends API
    """
    def __init__(self, username, password):
        """
        Initialize hard-coded URLs, HTTP headers, and login parameters
        needed to connect to Google Trends, then connect.
        """
        self.login_params = {
            'continue': 'http://www.google.com/trends',
            'PersistentCookie': 'yes',
            'Email': username,
            'Passwd': password}
        # provide a fake user agent so requests look like a desktop browser
        self.fake_ua = UserAgent()
        self.headers = [
            ('Referer', 'https://www.google.com/accounts/ServiceLoginBoxAuth'),
            ('Content-type', 'application/x-www-form-urlencoded'),
            ('User-Agent', self.fake_ua.chrome),
            ('Accept', 'text/plain')]
        self.url_ServiceLoginBoxAuth = 'https://accounts.google.com/ServiceLoginBoxAuth'
        self.url_Export = 'http://www.google.com/trends/trendsReport'
        self.url_CookieCheck = 'https://www.google.com/accounts/CheckCookie?chtml=LoginDoneHtml'
        self.url_PrefCookie = 'http://www.google.com'
        self._connect()

    def _connect(self):
        """
        Connect to Google Trends. Use cookies.
        """
        self.cj = CookieJar()
        self.opener = build_opener(HTTPCookieProcessor(self.cj))
        self.opener.addheaders = self.headers

        resp = self.opener.open(self.url_ServiceLoginBoxAuth).read()
        resp = re.sub(r'\s\s+', ' ', resp.decode(encoding='utf-8'))

        galx = re.compile(r'<input name="GALX"[\s]+type="hidden"[\s]+value="(?P<galx>[a-zA-Z0-9_-]+)">')
        m = galx.search(resp)
        if not m:
            galx = re.compile(r'<input type="hidden"[\s]+name="GALX"[\s]+value="(?P<galx>[a-zA-Z0-9_-]+)">')
            m = galx.search(resp)
            if not m:
                raise Exception('Cannot parse GALX out of login page')

        self.login_params['GALX'] = m.group('galx')
        params = urlencode(self.login_params).encode('utf-8')
        self.opener.open(self.url_ServiceLoginBoxAuth, params)
        self.opener.open(self.url_CookieCheck)
        self.opener.open(self.url_PrefCookie)

    def request_report(self, keywords, hl='en-US', cat=None, geo=None, date=None, tz=None, gprop=None):
        query_param = 'q=' + quote(keywords)

        # Build the optional query parameters.
        # Arguments left as None produce an empty parameter string, so they
        # do not filter the requested data; see the pytrends README for details.
        if cat is not None:
            cat_param = '&cat=' + cat
        else:
            cat_param = ''
        if date is not None:
            date_param = '&date=' + quote(date)
        else:
            date_param = ''
        if geo is not None:
            geo_param = '&geo=' + geo
        else:
            geo_param = ''
        if tz is not None:
            tz_param = '&tz=' + tz
        else:
            tz_param = ''
        if gprop is not None:
            gprop_param = '&gprop=' + gprop
        else:
            gprop_param = ''
        hl_param = '&hl=' + hl

        # These are the default parameters and shouldn't be changed.
        cmpt_param = "&cmpt=q"
        content_param = "&content=1"
        export_param = "&export=1"

        combined_params = query_param + cat_param + date_param + geo_param + hl_param + tz_param + cmpt_param \
                          + content_param + export_param + gprop_param

        print("Now downloading information for:")
        print("http://www.google.com/trends/trendsReport?" + combined_params)

        raw_data = self.opener.open("http://www.google.com/trends/trendsReport?" + combined_params).read()
        self.decode_data = raw_data.decode('utf-8')

        if self.decode_data == "You must be signed in to export data from Google Trends":
            print("You must be signed in to export data from Google Trends")
            raise Exception(self.decode_data)

    def save_csv(self, path, trend_name):
        fileName = path + trend_name + ".csv"
        with open(fileName, mode='wb') as f:
            f.write(self.decode_data.encode('utf8'))

    def get_data(self):
        return self.decode_data

    def get_suggestions(self, keyword):
        kw_param = quote(keyword)
        raw_data = self.opener.open("https://www.google.com/trends/api/autocomplete/" + kw_param).read()
        # the response is not valid JSON, but stripping the ")]}'," prefix (first 5 bytes) makes it valid
        json_data = json.loads(raw_data[5:].decode())
        return json_data


def parse_data(data):
    """
    Parse data in a Google Trends CSV export (as `str`) into JSON format
    with str values coerced into appropriate Python-native objects.

    Parameters
    ----------
    data : str
        CSV data as text, output by `pyGTrends.get_data()`

    Returns
    -------
    parsed_data : dict of lists
        contents of `data` parsed into JSON form with appropriate Python types;
        sub-tables split into separate dict items, keys are sub-table "names",
        and data values parsed according to type, e.g.
        '10' => 10, '10%' => 10, '2015-08-06' => `datetime.datetime(2015, 8, 6, 0, 0)`
    """
    parsed_data = {}
    for i, chunk in enumerate(re.split(r'\n{2,}', data)):
        if i == 0:
            match = re.search(r'^(.*?) interest: (.*)\n(.*?); (.*?)$', chunk)
            if match:
                source, query, geo, period = match.groups()
                parsed_data['info'] = {'source': source, 'query': query,
                                       'geo': geo, 'period': period}
        else:
            chunk = _clean_subtable(chunk)
            rows = [row for row in csv.reader(StringIO(chunk)) if row]
            if not rows:
                continue
            label, parsed_rows = _parse_rows(rows)
            if label in parsed_data:
                parsed_data[label+'_1'] = parsed_data.pop(label)
                parsed_data[label+'_2'] = parsed_rows
            else:
                parsed_data[label] = parsed_rows

    return parsed_data


def _clean_subtable(chunk):
    """
    The data output by Google Trends is human-friendly, not machine-friendly;
    this function fixes a couple of egregious data problems.
    1. Google replaces rising search percentages with "Breakout" if the increase
    is greater than 5000%: https://support.google.com/trends/answer/4355000 .
    For parsing's sake, we set it equal to that high threshold value.
    2. Rising search percentages between 1000 and 5000 have a comma separating
    the thousands, which is terrible for CSV data. We strip it out.
    """
    chunk = re.sub(r',Breakout', ',5000%', chunk)
    chunk = re.sub(r'(,[+-]?[1-4]),(\d{3}%\n)', r'\1\2', chunk)
    return chunk
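
# Illustrative example of the two fixes above (assumed input shape):
#   _clean_subtable('foo,Breakout\nbar,+1,200%\n') -> 'foo,5000%\nbar,+1200%\n'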


def _infer_dtype(val):
    """
    Using regex, infer a limited number of dtypes for string `val`
    (only dtypes expected to be found in a Google Trends CSV export).
    """
    if re.match(r'\d{4}-\d{2}(?:-\d{2})?', val):
        return 'date'
    elif re.match(r'[+-]?\d+$', val):
        return 'int'
    elif re.match(r'[+-]?\d+%$', val):
        return 'pct'
    elif re.match(r'[a-zA-Z ]+', val):
        return 'text'
    else:
        msg = "val={0} dtype not recognized".format(val)
        raise ValueError(msg)
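
# Illustrative examples (assumed inputs):
#   _infer_dtype('2015-08')     -> 'date'
#   _infer_dtype('-42')         -> 'int'
#   _infer_dtype('85%')         -> 'pct'
#   _infer_dtype('South Korea') -> 'text'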


def _convert_val(val, dtype):
    """
    Convert string `val` into Python-native object according to its `dtype`:
    '10' => 10, '10%' => 10, '2015-08-06' => `datetime.datetime(2015, 8, 6, 0, 0)`,
    ' ' => None, 'foo' => 'foo'
    """
    if not val.strip():
        return None
    elif dtype == 'date':
        match = re.match(r'(\d{4}-\d{2}-\d{2})', val)
        if match:
            return datetime.strptime(match.group(), '%Y-%m-%d')
        else:
            return datetime.strptime(re.match(r'(\d{4}-\d{2})', val).group(), '%Y-%m')
    elif dtype == 'int':
        return int(val)
    elif dtype == 'pct':
        return int(val[:-1])
    else:
        return val


def _parse_rows(rows, header='infer'):
    """
    Parse sub-table `rows` into JSON form and convert str values into appropriate
    Python types; if `header` == 'infer', attempt to infer whether a header row
    is present in `rows`; otherwise pass header=True/False explicitly.
    """
    if not rows:
        raise ValueError('rows={0} is invalid'.format(rows))
    rows = copy.copy(rows)
    label = rows[0][0].replace(' ', '_').lower()

    if header == 'infer':
        if len(rows) >= 3:
            if _infer_dtype(rows[1][-1]) != _infer_dtype(rows[2][-1]):
                header = True
            else:
                header = False
        else:
            header = False
    if header is True:
        colnames = rows[1]
        data_idx = 2
    else:
        colnames = None
        data_idx = 1

    data_dtypes = [_infer_dtype(val) for val in rows[data_idx]]
    if any(dd == 'pct' for dd in data_dtypes):
        label += '_pct'

    parsed_rows = []
    for row in rows[data_idx:]:
        vals = [_convert_val(val, dtype) for val, dtype in zip(row, data_dtypes)]
        if colnames:
            parsed_rows.append({colname:val for colname, val in zip(colnames, vals)})
        else:
            parsed_rows.append(vals)

    return label, parsed_rows
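
# Illustrative example (assumed sub-table shape):
#   _parse_rows([['Top subregions', ''], ['Subregion', 'python'], ['Seoul', '100']])
#   -> ('top_subregions', [{'Subregion': 'Seoul', 'python': 100}])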

Example code
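
A minimal usage sketch, assuming the listing above is saved as pyGTrends.py. The account credentials, keyword, region, and date window below are placeholders, and because this version of pytrends targets the old login-based trendsReport export endpoint, it may no longer work against the live Google Trends service.

from pyGTrends import pyGTrends, parse_data

# placeholder Google account credentials -- replace with your own
connector = pyGTrends('user@gmail.com', 'password')

# request an interest-over-time report; geo and date values here are
# illustrative (the old export API expected dates like 'MM/YYYY Nm')
connector.request_report('python', hl='en-US', geo='KR', date='01/2016 3m')

# save the raw CSV export, then parse it into Python-native objects
connector.save_csv('./', 'python_trends')
parsed = parse_data(connector.get_data())
print(parsed['info'])

# keyword suggestions from the autocomplete endpoint
print(connector.get_suggestions('python'))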



