BigQuery and SQL

2019-08-12 | Visits：

Most notes and code are from:
Intro to SQL

Introduction
- Table schema
Big Datasets
- Estimate the size of query
- Limit scan data
Techniques
- SELECT FROM
- WHERE
- More queries
- COUNT
- GROUP BY
- HAVING
- ORDER BY
- Dates
- EXTRACT
- WITH AS
- JOIN ON

Introduction

from google.clound import bigquery  

# Create a 'Client' object
client = bigquery.Client()  

# Construct a reference to the 'hacker_news' dataset
dataset_ref = client.dataset('hacker_news', project = 'bigquery-public-data')  

# API request - fetch the dataset
dataset = clinet.get_dataset(dataset_ref)  

# List all the tables in the 'Hacker_news' dataset
tables = list(client.list_tables(dataset))  

# Print names of all tables in the dataset
for table in tables:  
	print (table.table_id)  

Similar to fetch a dataset, also can fetch a table

# Construct a reference to the 'full' table
table_ref = dataset_ref.table('full')  

# API request - fetch the table
table = client.get_table(table_ref)  

# Preview the first five lines of the 'full' table (.head)  
client.list_rows(table, max_results = 5).to_dataframe()  

Table schema

Print information on all the columns in the “full” table in the “hacker_news” dataset

table.schema  

Each SchemaField tells us about the specific column which contains ‘name’, ‘fiel type’, ‘mode’, and ‘description’
list_rows() is to show the lines of the table and converts to a pandas DataFrame with to_dataframe() mehtod

# Preciew the first five lines of the 'full' table
client.list_rows(table, max_results=5).to_dataframe()

# Preview the first five entries in the 'by' column of the 'full' table
client.list_rows(table, selected_fields=table.schema[:1],max_results=5).to_dataframe()  

Big Datasets

Estimate the size of query

# Query to get the score column from every row where the type column has value "job"
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# Create a QueryJobConfig object to estimate size of query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

Limit scan data

# Only run the query if it's less than 1 MB
ONE_MB = 1000*1000

# Also can increase to 1 GB
# ONE_GB = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)

# Set up the query (will only run if it's less than 1 MB)
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame
safe_query_job.to_dataframe()

Techniques

SELECT FROM

# to select the Name column (from the full table in the hacker_news database in the bigquery-public-data project)
query = """
        SELECT Name  
        FROM 'bigquery-public-data.hacker_news.full'  
        """  
        # The triple quotation marks makes everything inside them be a single string  

WHERE

# Get the Name column which the News about Google
query = """
        SELECT score  
        FROM `bigquery-public-data.hacker_news.full`  
        WHERE type = "job"
        """

# Set up the query
query_job = client.query(query)  

# API request - run the query, and return a pandas DataFrame
us_cities = query_job.to_dataframe()  

# Then we can use any other DataFrame

# The five cities have the most measurements
us_cities.city.value_counts().head()  

More queries

# want multiple columns
query = """
        SELECT score, title
        """
# Wannt all columns
query = """
        SELECT *  
        """ 

COUNT

query = """
    SELECT COUNT(ID) # counts total number of news  
    FROM `bigquery-public-data.hacker_news.full`  
    """

Others aggregate functions: SUM(), AVG(), MIN(), MAX()…
COUNT(1) count the rows in each group

GROUP BY

query = """  
    SELECT score, COUNT(ID) # counts the numbers of each score  
    FROM `bigquery-public-data.hacker_news.full`  
    GROUP BY score  
    """

Note that all variables must be passed to either
1. GROUP BY command, or
2. An aggregation function
If any variables isn’t passed to either one, the error message will show up SELECT list expression references column (column's name) which is neither grouped nor aggregated at

HAVING

query = """  
    SELECT score, COUNT(ID) # counts the numbers of each score  
    FROM `bigquery-public-data.hacker_news.full`  
    GROUP BY score  
    HAVING COUNT(ID)>1 # output the table of score which larger than one  
    """

ORDER BY

query = """
    SELECT ID, score, title  
    FROM `bigquery-public-data.hacker_news.full`  
    ORDER BY ID # Columns with ID, score, title and with ID's order  
    # Text will show up with alphabetical order  
    # ORDER BY title DESC  
    # DESC argument (short for 'descending')  
    """

Dates

DATE
- YYYY-[M]M-[D]D
- YYYY: Four-digit year
- [M]M: One or two digit month
- [D]D: One or two digit day
- 2019-08-12 is interpreted as August 12, 2019
DATETIME
- Date with time added at the end

EXTRACT

Day

query = """  
  SELECT title, EXTRACT(DAY from Date) AS Day
  FROM `bigquery-public-data.hacker_news.full`  

Week
- WEEK
- DAYOFWEEK 1 (Snday) and 7(Saturday)

query = """  
    SELECT COUNT(score) AS score,  
        EXTRACT(DAYOFWEEK FROM Date) AS day_of_week  
    FROM `bigquery-public-data.hacker_news.full`  
    GROUP BY day_of_week  
    ORDER BY score DESC  

WITH AS

query = """
               WITH with_query AS
               (
                   SELECT trip_seconds, trip_miles, EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day
                   FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                   WHERE trip_start_timestamp >'2017-01-01' and
                       trip_start_timestamp < '2017-07-01' and
                       trip_seconds > 0 and
                       trip_miles > 0
               )
               SELECT hour_of_day,
                   count(1) AS num_trips,
                   3600 * SUM(trip_miles)/SUM(trip_seconds) AS avg_mph
               FROM with_query
               GROUP BY hour_of_day
               ORDER BY hour_of_day
               """

JOIN ON

query = """  
    SELECT f.title AS full_title, c.author AS author  
    FROM `bigquery-public-data.hacker_news.full` AS f  
    INNER JOIN `bigquery-public-data.hacker_news.comments` AS c  
        ON f.author = c.author  
    GROUP BY f.title  
    ORDER BY date DESC