Most notes and code are from:
Intro to SQL
Introduction
from google.cloud import bigquery
# Create a 'Client' object
client = bigquery.Client()
# Construct a reference to the 'hacker_news' dataset
dataset_ref = client.dataset('hacker_news', project='bigquery-public-data')
# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)
# List all the tables in the 'hacker_news' dataset
tables = list(client.list_tables(dataset))
# Print names of all tables in the dataset
for table in tables:
    print(table.table_id)
- Similar to fetching a dataset, we can also fetch a table
# Construct a reference to the 'full' table
table_ref = dataset_ref.table('full')
# API request - fetch the table
table = client.get_table(table_ref)
# Preview the first five lines of the 'full' table (like .head() in pandas)
client.list_rows(table, max_results=5).to_dataframe()
Table schema
- Print information on all the columns in the “full” table in the “hacker_news” dataset
table.schema
- Each SchemaField tells us about a specific column: its name, field type, mode, and description
- list_rows() shows rows of the table; convert the result to a pandas DataFrame with the to_dataframe() method
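For example, a minimal sketch (reusing the table object fetched above) that prints those attributes for every column:
# Print each SchemaField's name, field type, mode, and description
for field in table.schema:
    print(field.name, field.field_type, field.mode, field.description)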
# Preview the first five lines of the 'full' table
client.list_rows(table, max_results=5).to_dataframe()
# Preview the first five entries in the 'by' column of the 'full' table
client.list_rows(table, selected_fields=table.schema[:1], max_results=5).to_dataframe()
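A variation, sketched on the assumption that we want to pick the column by its name (the 'by' column, per the comment above) rather than by position:
# Preview a column by name instead of by position
by_field = [field for field in table.schema if field.name == 'by']
client.list_rows(table, selected_fields=by_field, max_results=5).to_dataframe()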
Big Datasets
Estimate the size of a query
# Query to get the score and title columns from every row where the type column has value "job"
query = """
SELECT score, title
FROM `bigquery-public-data.hacker_news.full`
WHERE type = "job"
"""
# Create a QueryJobConfig object to estimate size of query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))
Limit scanned data
# Only run the query if it's less than 1 MB
ONE_MB = 1000*1000
# Can also be increased to 1 GB
# ONE_GB = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_MB)
# Set up the query (will only run if it's less than 1 MB)
safe_query_job = client.query(query, job_config=safe_config)
# API request - try to run the query, and return a pandas DataFrame
safe_query_job.to_dataframe()
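If the query would scan more than the configured maximum_bytes_billed, the job is cancelled and an exception is raised instead of running. A small sketch of handling that (the exact exception class depends on the client library version, so a broad except is used here):
# Try to run the query; it errors out rather than exceeding the limit
try:
    safe_query_job = client.query(query, job_config=safe_config)
    results = safe_query_job.to_dataframe()
except Exception as e:
    print('Query cancelled:', e)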
Techniques
SELECT FROM
# To select the title column (from the full table in the hacker_news dataset in the bigquery-public-data project)
query = """
SELECT title
FROM `bigquery-public-data.hacker_news.full`
"""
# The triple quotation marks make everything inside them a single string
WHERE
# Get the score column for rows where the type column is "job"
query = """
SELECT score
FROM `bigquery-public-data.hacker_news.full`
WHERE type = "job"
"""
# Set up the query
query_job = client.query(query)
# API request - run the query, and return a pandas DataFrame
job_scores = query_job.to_dataframe()
# Then we can use any pandas DataFrame method on the result
# e.g. the five most common scores
job_scores.score.value_counts().head()
More queries
# Want multiple columns
query = """
SELECT score, title
FROM `bigquery-public-data.hacker_news.full`
"""
# Want all columns
query = """
SELECT *
FROM `bigquery-public-data.hacker_news.full`
"""
COUNT
query = """
SELECT COUNT(id) # counts the total number of rows
FROM `bigquery-public-data.hacker_news.full`
"""
- Other aggregate functions: SUM(), AVG(), MIN(), MAX()…
- COUNT(1) counts the rows in each group
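For instance, a small sketch combining a few of these aggregates on the same table (score column as above):
query = """
SELECT COUNT(1) AS num_rows, AVG(score) AS avg_score, MAX(score) AS max_score
FROM `bigquery-public-data.hacker_news.full`
"""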
GROUP BY
query = """
SELECT score, COUNT(id) # counts the number of rows for each score
FROM `bigquery-public-data.hacker_news.full`
GROUP BY score
"""
- Note that every variable in the SELECT list must be passed to either
  - the GROUP BY command, or
  - an aggregation function
- If any variable isn't passed to either one, this error message will show up:
  SELECT list expression references column (column's name) which is neither grouped nor aggregated at
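For example (a sketch; the first query fails because title is neither grouped nor aggregated):
# Fails: title is neither grouped nor aggregated
# SELECT score, title, COUNT(id)
# FROM `bigquery-public-data.hacker_news.full`
# GROUP BY score
# Works: title is dropped, so every column is grouped or aggregated
query = """
SELECT score, COUNT(id) AS num_posts
FROM `bigquery-public-data.hacker_news.full`
GROUP BY score
"""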
HAVING
query = """
SELECT score, COUNT(id) # counts the number of rows for each score
FROM `bigquery-public-data.hacker_news.full`
GROUP BY score
HAVING COUNT(id) > 1 # keep only scores that appear more than once
"""
ORDER BY
query = """
SELECT id, score, title
FROM `bigquery-public-data.hacker_news.full`
ORDER BY id # returns id, score, and title, sorted by id
# Text columns are sorted in alphabetical order
# ORDER BY title DESC
# The DESC argument (short for 'descending') reverses the order
"""
Dates
- DATE format: YYYY-[M]M-[D]D
  - YYYY: four-digit year
  - [M]M: one- or two-digit month
  - [D]D: one- or two-digit day
  - e.g. 2019-08-12 is interpreted as August 12, 2019
- DATETIME
  - Date with time added at the end
EXTRACT
- Day
query = """ SELECT title, EXTRACT(DAY from Date) AS Day FROM `bigquery-public-data.hacker_news.full`
- Week
  - WEEK and DAYOFWEEK also work; DAYOFWEEK returns 1 (Sunday) through 7 (Saturday)
query = """
SELECT COUNT(score) AS score,
EXTRACT(DAYOFWEEK FROM Date) AS day_of_week
FROM `bigquery-public-data.hacker_news.full`
GROUP BY day_of_week
ORDER BY score DESC
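To actually run it, a sketch reusing the pattern from the Big Datasets section (the 1 GB limit is an assumption; this table scans far more than 1 MB):
ONE_GB = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)
query_job = client.query(query, job_config=safe_config)
query_job.to_dataframe().head()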
WITH AS
query = """
WITH with_query AS
(
SELECT trip_seconds, trip_miles, EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE trip_start_timestamp > '2017-01-01' AND
      trip_start_timestamp < '2017-07-01' AND
      trip_seconds > 0 AND
      trip_miles > 0
)
SELECT hour_of_day,
COUNT(1) AS num_trips,
3600 * SUM(trip_miles)/SUM(trip_seconds) AS avg_mph
FROM with_query
GROUP BY hour_of_day
ORDER BY hour_of_day
"""
JOIN ON
query = """
SELECT f.title AS full_title, c.author AS author
FROM `bigquery-public-data.hacker_news.full` AS f
INNER JOIN `bigquery-public-data.hacker_news.comments` AS c
ON f.author = c.author
GROUP BY f.title
ORDER BY date DESC
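And a sketch of running the join (the 10 GB limit is an assumption to keep the scan bounded):
TEN_GB = 10*1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=TEN_GB)
join_result = client.query(query, job_config=safe_config).to_dataframe()
join_result.head()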