VOC Analysis
Volatile Organic Compounds (VOC) analysis workflow, including OFP and SOAP calculation.
Data Preparation
Note
RawDataReader('VOC', ...) is deprecated — the VOC reader is a thin CSV
loader. Read the file directly with pandas and pass the DataFrame to
voc_potentials, which validates species against support_voc.json.
import pandas as pd
from AeroViz import voc_potentials
# Read VOC data (datetime index in column 0; '-' / 'N.D.' treated as NA)
voc = pd.read_csv('./data/voc/voc.csv', index_col=0, parse_dates=True,
na_values=('-', 'N.D.'))
voc.columns = voc.columns.str.strip()
# View available species
print(voc.columns.tolist())
# ['Ethane', 'Propane', 'Butane', 'Benzene', 'Toluene', ...]
OFP/SOAP Calculation
Basic Calculation
# Calculate VOC reactivity potentials
result = voc_potentials(voc)
# Returns a dict with four DataFrames (each time-indexed):
# result['Conc'] - mass concentration (ug/m3)
# result['OFP'] - Ozone Formation Potential (ug O3/m3)
# result['SOAP'] - Secondary Organic Aerosol Potential
# result['LOH'] - OH-reactivity (loss rate)
# Each frame's columns are the individual species PLUS per-class totals
# (e.g. 'aromatic_total', 'alkane_total') and a grand 'Total' column.
df_ofp = result['OFP']
df_soap = result['SOAP']
# Grand totals live in the 'Total' column of each frame:
print(f"Mean total OFP: {df_ofp['Total'].mean():.1f} ug O3/m3")
print(f"Mean total SOAP: {df_soap['Total'].mean():.2f}")
Species Contribution Analysis
Top 10 Contributing Species
# Drop the aggregate columns ('Total' and the per-class '*_total' columns)
# so we rank only individual species.
species_cols = [c for c in df_ofp.columns
if c != 'Total' and not c.endswith('_total')]
# Calculate mean OFP contribution for each species
mean_ofp = df_ofp[species_cols].mean().sort_values(ascending=False)
# Top 10
top10 = mean_ofp.head(10)
print("Top 10 OFP contributors:")
for species, value in top10.items():
print(f" {species}: {value:.1f} ug O3/m3")
Species Contribution Ratio
import matplotlib.pyplot as plt
# Calculate contribution ratio (use the 'Total' column as the denominator,
# and only individual species in the numerator)
contribution = df_ofp[species_cols].div(df_ofp['Total'], axis=0) * 100
# Mean contribution ratio
mean_contrib = contribution.mean().sort_values(ascending=False)
# Plot pie chart (Top 8 + Others)
top8 = mean_contrib.head(8)
others = mean_contrib[8:].sum()
plot_data = pd.concat([top8, pd.Series({'Others': others})])
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(plot_data, labels=plot_data.index, autopct='%1.1f%%')
ax.set_title('OFP Contribution by Species')
plt.show()
Analysis by Chemical Category
# Define chemical categories
categories = {
'Alkanes': ['Ethane', 'Propane', 'Butane', 'Pentane', 'Hexane'],
'Alkenes': ['Ethene', 'Propene', 'Butene', 'Isoprene'],
'Aromatics': ['Benzene', 'Toluene', 'Ethylbenzene', 'Xylene'],
'OVOCs': ['Formaldehyde', 'Acetaldehyde', 'Acetone']
}
# Calculate OFP for each category
category_ofp = {}
for cat, species in categories.items():
# Find matching columns
cols = [c for c in df_ofp.columns if any(s in c for s in species)]
if cols:
category_ofp[cat] = df_ofp[cols].sum(axis=1)
df_cat_ofp = pd.DataFrame(category_ofp)
# Plot stacked bar chart
fig, ax = plt.subplots(figsize=(12, 6))
df_cat_ofp.resample('D').mean().plot(kind='bar', stacked=True, ax=ax)
ax.set_ylabel('OFP (ug O3/m3)')
ax.set_title('Daily OFP by Chemical Category')
plt.tight_layout()
plt.show()
Temporal Variation Analysis
Diurnal Variation
# Calculate diurnal variation of total OFP
total_ofp = df_ofp['Total']
hourly = total_ofp.groupby(total_ofp.index.hour).agg(['mean', 'std'])
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(hourly.index, hourly['mean'], 'b-o')
ax.fill_between(hourly.index,
hourly['mean'] - hourly['std'],
hourly['mean'] + hourly['std'],
alpha=0.3)
ax.set_xlabel('Hour')
ax.set_ylabel('OFP (ug O3/m3)')
ax.set_title('Diurnal Variation of Total OFP')
ax.set_xticks(range(0, 24, 3))
plt.show()
Seasonal Variation
# Monthly average
monthly = df_ofp['Total'].resample('ME').mean()
fig, ax = plt.subplots(figsize=(10, 5))
monthly.plot(kind='bar', ax=ax)
ax.set_ylabel('OFP (ug O3/m3)')
ax.set_title('Monthly Average OFP')
plt.show()
OFP vs SOAP Relationship
total_ofp = result['OFP']['Total']
total_soap = result['SOAP']['Total']
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(total_ofp, total_soap, alpha=0.5)
ax.set_xlabel('Total OFP (ug O3/m3)')
ax.set_ylabel('Total SOAP')
ax.set_title('OFP vs SOAP Relationship')
# Correlation
from scipy import stats
mask = total_ofp.notna() & total_soap.notna()
r, p = stats.pearsonr(total_ofp[mask], total_soap[mask])
ax.text(0.05, 0.95, f'r = {r:.3f}\np < 0.001' if p < 0.001 else f'r = {r:.3f}\np = {p:.3f}',
transform=ax.transAxes, va='top')
plt.show()
Complete Analysis Script
import pandas as pd
import matplotlib.pyplot as plt
from AeroViz import voc_potentials
# 1. Read VOC data with pandas (datetime index in column 0)
voc = pd.read_csv('./data/voc/voc.csv', index_col=0, parse_dates=True,
na_values=('-', 'N.D.'))
voc.columns = voc.columns.str.strip()
# 2. Calculate OFP/SOAP
result = voc_potentials(voc)
df_ofp = result['OFP'] # columns: species + '*_total' classes + 'Total'
df_soap = result['SOAP']
total_ofp = df_ofp['Total'] # grand total OFP, time-indexed Series
total_soap = df_soap['Total']
# 3. Species contribution analysis (rank individual species only)
species_cols = [c for c in df_ofp.columns
if c != 'Total' and not c.endswith('_total')]
mean_ofp = df_ofp[species_cols].mean().sort_values(ascending=False)
print("=== Top 5 OFP Contributors ===")
for i, (species, value) in enumerate(mean_ofp.head(5).items(), 1):
pct = value / total_ofp.mean() * 100
print(f"{i}. {species}: {value:.1f} ug O3/m3 ({pct:.1f}%)")
# 4. Output results summary
print("\n=== VOC Analysis Summary ===")
print(f"Total OFP: {total_ofp.mean():.1f} +/- {total_ofp.std():.1f} ug O3/m3")
print(f"Total SOAP: {total_soap.mean():.2f} +/- {total_soap.std():.2f}")
# 5. Plot contribution chart
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# OFP contribution
top10_ofp = mean_ofp.head(10)
axes[0].barh(range(10), top10_ofp.values)
axes[0].set_yticks(range(10))
axes[0].set_yticklabels(list(top10_ofp.index))
axes[0].set_xlabel('OFP (ug O3/m3)')
axes[0].set_title('Top 10 OFP Contributors')
# Diurnal variation
hourly = total_ofp.groupby(total_ofp.index.hour).mean()
axes[1].plot(hourly.index, hourly.values, 'b-o')
axes[1].set_xlabel('Hour')
axes[1].set_ylabel('OFP (ug O3/m3)')
axes[1].set_title('Diurnal Pattern')
plt.tight_layout()
plt.savefig('voc_analysis.png', dpi=300)