分数表 scores,字段:id, subject, score
找出每一科都是前30%的学生ID。
造数据:
import pandas as pd
import numpy as np
import random# 设置随机种子保证结果可重现
np.random.seed(42)  # fixed seed: every run draws the same score sequence


def generate_score_data(num_students: int = 50, num_subjects: int = 5) -> "tuple[pd.DataFrame, list[str]]":
    """Generate simulated student score data in long format.

    Each student receives one score per subject. Scores are drawn from a
    normal distribution (mean 70, standard deviation 15), truncated to an
    integer and clamped to the range 0-100.

    Args:
        num_students: Number of students; IDs run from ``S001`` upward.
        num_subjects: How many of the built-in subjects to use, taken from
            the start of the built-in list.

    Returns:
        A ``(df, subjects)`` tuple: ``df`` has columns ``id``, ``subject``
        and ``score`` with one row per (student, subject) pair, and
        ``subjects`` is the list of subject names actually used.

    Raises:
        ValueError: If ``num_subjects`` is negative or exceeds the number
            of built-in subjects.
    """
    all_subjects = ['数学', '英语', '科学', '历史', '艺术']
    if not 0 <= num_subjects <= len(all_subjects):
        raise ValueError(
            f"num_subjects must be between 0 and {len(all_subjects)}, "
            f"got {num_subjects}"
        )
    # Honor the requested subject count; the default of 5 keeps all subjects.
    subjects = all_subjects[:num_subjects]

    student_ids = [f'S{i:03d}' for i in range(1, num_students + 1)]

    data = []
    # Draw one score at a time, student-major then subject, so the sequence
    # of random draws (and hence the seeded output) stays stable.
    for student_id in student_ids:
        for subject in subjects:
            score = np.random.normal(70, 15)
            score = max(0, min(100, int(score)))  # clamp to the 0-100 range
            data.append([student_id, subject, score])

    df = pd.DataFrame(data, columns=['id', 'subject', 'score'])
    return df, subjects


# Generate the data
# Build the sample data set, then show its first ten rows as a quick check.
scores_df, subject_list = generate_score_data()
print("前10条数据示例:")
print(scores_df.head(10))
sql:
-- Return the IDs of students who rank in the top 30% of every subject
-- for which they have a row in scores.
WITH subject_total AS (
    -- Number of score rows per subject; used as the base for the 30% cutoff.
    SELECT subject, COUNT(*) AS total
    FROM scores
    GROUP BY subject
),
student_subjects AS (
    -- Number of score rows each student has.
    SELECT id, COUNT(*) AS total_subjects
    FROM scores
    GROUP BY id
),
ranked_scores AS (
    -- Position of each score within its subject, highest score first.
    -- The alias is score_rank, not rank: RANK is a reserved word in MySQL 8.0+.
    SELECT id, subject,
           RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS score_rank
    FROM scores
)
SELECT r.id
FROM ranked_scores r
JOIN subject_total st ON r.subject = st.subject
JOIN student_subjects s ON r.id = s.id
-- Keep only the rows where the student is inside the subject's top 30%.
WHERE r.score_rank <= st.total * 0.3
-- total_subjects is constant per id; it is listed here so that HAVING may
-- reference it on engines that reject ungrouped columns (e.g. PostgreSQL).
GROUP BY r.id, s.total_subjects
-- A student qualifies only if every one of their rows survived the filter.
HAVING COUNT(*) = s.total_subjects;